o
    –h»  ã                   @   s¤   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ej
jej
jgZdZdZdZd	Zd
ZdZdZdZdd„ Zd ¡ Zdd„ Zdd„ Zdd„ Zdd„ ZdS )z­
A few tests for Vietnamese parsing, which has some difficulties related to spaces in words

Technically some other languages can have this, too, like that one French token
é    N)Úpretrain)Útree_reader)Úbuild_traineruŒ   (ROOT (S-TTL (NP (" ") (N-H Äáº£o) (Np ÄÃ i Loan) (" ") (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u Long))))) (. .)))uŒ   (ROOT (S-TTL (NP (" ") (N-H Äáº£o) (Np ÄÃ i_Loan) (" ") (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng_báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u_Long))))) (. .)))u€   (ROOT (S (NP (" ") (N Äáº£o) (Np ÄÃ i Loan) (" ") (PP (E á»Ÿ) (NP (N Ä‘á»“ng báº±ng) (NP (N sÃ´ng) (Np Cá»­u Long))))) (. .)))uœ   (ROOT (S-TTL (NP (PUNCT -LRB-) (N-H Äáº£o) (Np ÄÃ i Loan) (PUNCT -RRB-) (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u Long))))) (. .)))uœ   <s>
(S-TTL (NP (PUNCT LBKT) (N-H Äáº£o) (Np ÄÃ i_Loan) (PUNCT RBKT) (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng_báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u_Long))))) (. .))
</s>u¢   <s id=50>
(S-TTL (NP (PUNCT LBKT) (N-H Äáº£o) (Np ÄÃ i_Loan) (PUNCT RBKT) (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng_báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u_Long))))) (. .))
</s>u£   <s id=100>
(S-TTL (NP (PUNCT LBKT) (N-H Äáº£o) (Np ÄÃ i_Loan) (PUNCT RBKT) (PP (E-H á»Ÿ) (NP (N-H Ä‘á»“ng_báº±ng) (NP (N-H sÃ´ng) (Np Cá»­u_Long))))) (. .))
</s>uÅ   (_ROOT (_S (_NP (_" " )_" (_N Äáº£o )_N (_Np ÄÃ i_Loan )_Np (_" " )_" (_PP (_E á»Ÿ )_E (_NP (_N Ä‘á»“ng_báº±ng )_N (_NP (_N sÃ´ng )_N (_Np Cá»­u_Long )_Np )_NP )_NP )_PP )_NP (_. . )_. )_S )_ROOTc                  C   sz   t  d¡d } t | ¡}t|ƒdksJ ‚t|d ƒ| ksJ ‚|d jd jd jd }| ¡ s1J ‚|jd jdks;J ‚dS )z`
    Test that an individual tree with spaces in the leaves is being processed as we expect
    Ú
r   é   é   u
   ÄÃ i LoanN)	ÚVI_TREEBANKÚsplitr   Ú
read_treesÚlenÚstrÚchildrenÚis_preterminalÚlabel)ÚtextÚtreesÚnode© r   úd/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/tests/constituency/test_vietnamese.pyÚtest_read_vi_tree!   s   
r   u”   
4 4
Äáº£o          0.11 0.21 0.31 0.41
ÄÃ i Loan     0.12 0.22 0.32 0.42
Ä‘á»“ng báº±ng    0.13 0.23 0.33 0.43
sÃ´ng         0.14 0.24 0.34 0.44
c            
   	   C   sà   t  d¡d } t | ¡}t|d  ¡ ƒ}t ¡ E}tj	 
|d¡}tj	 
|d¡}t|ddd}| t¡ W d  ƒ n1 s>w   Y  tj||d	d
}| ¡  t|ƒ}|j}	W d  ƒ n1 s`w   Y  |	 |¡dksnJ ‚dS )zT
    Test that a VI embedding's words are correctly found when processing trees
    r   r   zemb.txtzemb.ptÚwzutf-8)ÚencodingNT)ÚfilenameÚvec_filenameÚsave_to_fileé   )r   r	   r   r
   ÚsetÚleaf_labelsÚtempfileÚTemporaryDirectoryÚosÚpathÚjoinÚopenÚwriteÚVI_EMBEDDINGr   ÚPretrainÚloadr   ÚmodelÚnum_words_known)
r   r   ÚwordsÚtempdirÚemb_filenameÚpt_filenameÚfoutÚptÚtrainerr(   r   r   r   Útest_vi_embedding7   s   

ÿ÷r1   c                  C   sl   t  d¡d } t | ¡}t|ƒdksJ ‚t|d ƒ| ksJ ‚d |d ¡t ks)J ‚d |d ¡tks4J ‚dS )z^
    By default, spaces are left as spaces, but there is a format option to change spaces
    r   r   r   z{}z{:_O}N)r   r	   r   r
   r   r   ÚformatÚVI_TREEBANK_UNDERSCORE©r   r   r   r   r   Útest_space_formattingM   s   
r5   c                  C   s  t  d¡d } t | ¡}t|ƒdksJ ‚t|d ƒ| ksJ ‚d |d ¡tks)J ‚d|d _d |d ¡t	ks9J ‚d|d _d |d ¡t
ksIJ ‚t d¡d }t t¡ d	 |¡ W d   ƒ n1 sew   Y  t d
¡d }t t¡ d	 |¡ W d   ƒ d S 1 s‡w   Y  d S )Nr   r   r   z{:_V}é2   z{:_Vi}éd   z(ROOT)z{:V}z(ROOT (1) (2) (3)))ÚVI_TREEBANK_PARENr	   r   r
   r   r   r2   ÚVI_TREEBANK_VLSPÚtree_idÚVI_TREEBANK_VLSP_50ÚVI_TREEBANK_VLSP_100ÚpytestÚraisesÚ
ValueError)r   r   ÚemptyÚbranchesr   r   r   Útest_vlsp_formattingY   s"   


ÿ"ÿrB   c                  C   sh   t  d¡d } t | ¡}dd„ |D ƒ}t|ƒdksJ ‚t|d ƒtks%J ‚d |d ¡} | tks2J ‚dS )z?
    Test turning the parse tree into a 'language' for GPT
    r   r   c                 S   s   g | ]}|  ¡  ¡ ‘qS r   )Ú
prune_noneÚsimplify_labels)Ú.0Útr   r   r   Ú
<listcomp>s   s    z,test_language_formatting.<locals>.<listcomp>r   z{:L}N)	r   r	   r   r
   r   r   ÚVI_TREEBANK_SIMPLEr2   ÚEXPECTED_LABELED_BRACKETSr4   r   r   r   Útest_language_formattingm   s   
rJ   )Ú__doc__r    r   r=   Ústanza.models.commonr   Ústanza.models.constituencyr   Ú&stanza.tests.constituency.test_trainerr   ÚmarkÚpipelineÚtravisÚ
pytestmarkr   r3   rH   r8   r9   r;   r<   rI   r   Ústripr%   r1   r5   rB   rJ   r   r   r   r   Ú<module>   s0    ú