o
    ht&                     @   s~  d Z ddlZddlZddlZddlZddlmZ ddlT ddlm	Z	m
Z
 ejjejjgZdd ZdZd	Zd
ZdZdddddZdd Zejdddd Zejdddd Zg dg dgZg dgZg dgZdd Zg dg dgZejg d ejd!ejg d"ejd!gZ g d#gZ!ejg d$ejd!gZ"g d%gZ#ejg d&ejd!gZ$d'd( Z%d)d* Z&d+d, Z'd-d. Z(d/d0 Z)dS )1z
Very simple test of the mwt counting functionality in tokenization/data.py

TODO: could add a bunch more simple tests, including tests of reading
the data from a temp file, for example
    N)Pipeline)*)
DataLoader
NUMERIC_REc                 C   s   t jdd| dd}|j}|| W d   n1 sw   Y  t jdd| dd}|j}|| W d   ||fS 1 s@w   Y  ||fS )z
    Writes raw_text and labels to randomly named files in test_dir

    Note that the tempfiles are not set to automatically clean up.
    This will not be a problem if you put them in a tempdir.
    wzutf-8F)modeencodingdirdeleteN)tempfileNamedTemporaryFilenamewrite)test_dirraw_textlabelsfouttxt_file
label_file r   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/tests/tokenization/test_tokenize_data.pywrite_tokenizer_input   s   
r   z2Sehr gute Beratung, schnelle Behebung der Probleme200010000100000000110000000010000000010001000000002z* Die Kosten sind definitiv auch im Rahmen.*000100000010000100000000010000100300000012de)space_beforecapitalizedi,  F)lang
feat_funcs
max_seqlenuse_dictionaryc                  C   s   t jtd7} t| tt\}}tt||dd}| rJ t| t	t
\}}tt||dd}| s5J W d   dS 1 s@w   Y  dS )z0
    One dataset has no mwt, the other does
    r	   txtlabel)argsinput_filesN)r   TemporaryDirectoryTEST_WORKING_DIRr   NO_MWT_TEXTNO_MWT_LABELSr   FAKE_PROPERTIEShas_mwtMWT_TEXT
MWT_LABELS)r   r   r   datar   r   r   test_has_mwt4   s   "r0   module)scopec                  C      t dtd dd} | jd }|S )Nentokenizer	   download_method
processorsr   TEST_MODELS_DIRr8   pipeline	tokenizerr   r   r   r=   A      
r=   c                  C   r3   )Nzzh-hansr5   r6   r9   r;   r   r   r   zhtokG   r>   r?   )Tr   hr   ir   sr    r   rD   rF   rH   ar   rH   tr   er   rF   rL   )fr   or   rR   )r@   rB   rD   rF   rH   rD   rF   rH   rJ   rH   rL   rN   rF   rL   rH   rP   rR   rR   )r@   rB   rD   rF   rH   rD   rF   rH   rJ   rH   rL   rN   rF   rL   rP   rR   rR   c                 C   s   d}t | j|| jd| jjd}|jtksJ d}t | j|| jd| jjd}|jtks,J t| j}d|d< t ||| jd| jjd}|jt	ksHJ dS )z;
    Tests converting a couple small segments to units
    zThis is a      test

fooT
input_textvocab
evaluation
dictionaryzThis is a      test
fooskip_newlineN)
r   configrV   trainerrX   r/   EXPECTED_TWO_NL_RAWEXPECTED_ONE_NL_RAWdictEXPECTED_SKIP_NL_RAW)r=   r   batchesskip_newline_configr   r   r   test_convert_units_raw_textR   s   
rb   )r@   rB   rD   rF   rH   rD   rF   rH   rJ   rH   rL   rN   rF   rL   .   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   re   )dtype)r   r   r   )r@   rB   rD   rF   rH   rD   rF   rH   rJ   rH   rL   rN   rF   rL   rc   rH   rP   rR   rR   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   re   r   r   r   r   )r@   rB   rD   rF   rH   rD   rF   rH   rJ   rH   rL   rN   rF   rL   rc   rP   rR   rR   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   re   r   r   r   c                 C   s<   t | t |ks
J t| |D ]\}}t||sJ qd S )N)lenzipnparray_equiv)r   expected_labelsr$   expectedr   r   r   check_labelso   s   rm   c                 C   s>  t jtd}d}d}t|||\}}t| j||d| jd| jjd}|j	t
ks*J t| t d}d}t|||\}}t| j||d| jd| jjd}|j	tksTJ t| t t| j}d|d	< d}d}t|||\}}t|||d| jd| jjd}|j	tksJ t| t W d
   d
S 1 sw   Y  d
S )zJ
    Tests reading some text from a file and converting that to units
    r!   z00000000000000000001

000

zThis is a      test.

foo

r"   T)r&   rV   rW   rX   z000000000000000000010000

zThis is a      test.
foo

rY   N)r   r'   r(   r   r   rZ   rV   r[   rX   r/   EXPECTED_TWO_NL_FILErm   r   EXPECTED_TWO_NL_FILE_LABELSEXPECTED_ONE_NL_FILEEXPECTED_ONE_NL_FILE_LABELSr^   EXPECTED_SKIP_NL_FILEEXPECTED_SKIP_NL_FILE_LABELS)r=   r   r   r   r   r   r`   ra   r   r   r   test_convert_units_filet   s,     
"rt   c                 C   s   | j jdusJ | j jdusJ d| j jv sJ d| j jv s J d| j jd v s*J d| j jd v s4J d| j jd v s>J d	| j jd v sHJ dS )
z
    Tests some features of the zh tokenizer dictionary

    The expectation is that the Chinese tokenizer will be serialized with a dictionary
    (if it ever gets serialized without, this test will warn us!)
    Nu   老师u	   蛋白质u   蛋白prefixesu   蛋u   白质suffixesu   质)r[   lexiconrX   )r?   r   r   r   test_dictionary   s   rx   c                 C   s   d}t | j|| jd| jjd}|j}t|dksJ t|d dks$J g dg dg dg dg d	g d
g}t|D ]\}}||d |}||ksNJ q<dS )zO
    Test the results of running a sentence into the dictionary featurizer
    u   我想吃蛋白质TrT   re   r      )r   r   r   r   r   r   r   r   )re   re   r   r   r   r   r   r   )r   r   r   r   re   r   r   r   )r   r   r   r   r   re   r   r   N)	r   rZ   rV   r[   rX   r/   rg   	enumerateextract_dict_feat)r?   r   r`   r/   expected_featuresrE   rl   dict_featuresr   r   r   test_dictionary_feats   s    r~   c                  C   sL   g d} g d}| D ]}t |dusJ q
|D ]}t |du s#J qdS )za
    Test the "is numeric" function

    This function is entirely based on an RE in data.py
    )	57	135245345z12535.z852358.458345z435345...345345z111,,,111,,,111,,,1115318008u   ５u   ๕)	z.454353az
5453 35345	aaa143234za,a,a,azsh'reyanasdaf786876asdfasdf `11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111aN)r   match)matchesnot_matchesxr   r   r   test_numeric_re   s   r   )*__doc__pytestr   numpyri   stanzar   stanza.testsstanza.models.tokenization.datar   r   marktravisr<   
pytestmarkr   r)   r*   r-   r.   r+   r0   fixturer=   r?   r\   r]   r_   rb   rn   arrayint32ro   rp   rq   rr   rs   rm   rt   rx   r~   r   r   r   r   r   <module>   sR    







"