"""
Very simple test of the sentence slicing by <PAD> tags

TODO: could add a bunch more simple tests for the tokenization utils
"""

import pytest

import stanza
from stanza import Pipeline
from stanza.tests import *

from stanza.models.common import doc
from stanza.models.tokenization import data
from stanza.models.tokenization import utils

pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
    N)Pipeline)*)doc)data)utilsc                  C   s   g d} t | dgksJ g d} t | dgksJ g d} t | dgks*J g d} t | dgks8J g d} t | dd	gksGJ d
S )z0
    Test various raw -> span manipulations
    )unbar    moxr   r   pr
   l)r      )r   r   r	   r
   r   r   r   r   r   r   r   r   r
   r   <PAD>)r   r   r   r	   r
   r   r   r   r   r   r   r   r   r
   r   r   )      )r   r   r   r	   r
   r   r   r   r   r   r   r   r   r
   r   )r   r   r   r	   r
   r   r   r   r   r   r   r   r   r
   r   )r      )   r   N)r   
find_spans)raw r   h/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/tests/tokenization/test_tokenize_utils.pytest_find_spans   s   r   c                 C   s   t | jt |ksJ t| j|D ],\}}t |jt |ks J t|j|D ]\}}|j|d ks3J |j|d ks<J q&qdS )zm
    Compare the start_char and end_char of the tokens in the doc with the given list of list of offsets
    r   r   N)len	sentencesziptokens

def test_match_tokens_with_text():
    """
    Test the conversion of pretokenized text to Document
    """
    doc = utils.match_tokens_with_text([["This", "is", "a", "test"]], "Thisisatest")
    expected_offsets = [[(0, 4), (4, 6), (6, 7), (7, 11)]]
    check_offsets(doc, expected_offsets)

    doc = utils.match_tokens_with_text([["This", "is", "a", "test"], ["unban", "mox", "opal", "!"]],
                                       "Thisisatest  unban mox  opal!")
    expected_offsets = [[(0, 4), (4, 6), (6, 7), (7, 11)],
                        [(13, 18), (19, 22), (24, 28), (28, 29)]]
    check_offsets(doc, expected_offsets)

    # text longer than the tokens can account for
    with pytest.raises(ValueError):
        doc = utils.match_tokens_with_text([["This", "is", "a", "test"]], "Thisisatesttttt")

    # text shorter than the tokens
    with pytest.raises(ValueError):
        doc = utils.match_tokens_with_text([["This", "is", "a", "test"]], "Thisisates")

    # a token which is not in the text at all
    with pytest.raises(ValueError):
        doc = utils.match_tokens_with_text([["This", "is", "a", "test", "!"]], "Thisisatest")
  t dtdd} | jd }dd }tj|j||jd|jjd}d|_	t
t  tjd|j||jdd	||jd
dd\}}}}W d   n1 sIw   Y  tj|j||jd|jjd}tjd|j||jdd	||jd
dd\}}}}t||}t|jdksJ dS )zL
    Test the tokenizer's capacity to break text up into smaller chunks
    entokenize)dir
processorszATIL not to ask a date to dress up as Smurfette on a first date.  d   T)
input_textvocab
evaluation
dictionaryNi  	no_ssplitF)	orig_textrJ   )r   TEST_MODELS_DIRrD   r   
DataLoaderconfigrG   trainerrI   advance_old_batchr=   r>   	TypeErrorr   output_predictionsgetr   Documentr   r   )pipeline	tokenizerraw_textbatches_documentr   r   r   test_long_paragraphF   s$   
r[   c               
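
# The advance_old_batch trick above presumably works because calling None
# raises a TypeError: if the long paragraph forces the chunked code path, the
# monkeypatched DataLoader fails, whereas the fresh DataLoader succeeds.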
      s   g dg dg d} ddddddd	dd
ddddddd
dddddddddddddddgdddddddddddd d!d"ddd
dd"d#ddgg} fd$d%}t ||| }||ks`J d&S )'z
    Check that the postprocessor behaves correctly by applying the identity postprocessor and hoping that it does indeed return correctly.
    IamJoe.	   ⭆⊱⇞Hi.I'mr
   chickenrb   &   I am Joe. ⭆⊱⇞ Hi. I'm a chicken.r   r]   r   idtextr    r!      r^   r,      r_      	   r`   
   r2   ra   r      SpaceAfter=Norh   ri   r    r!   miscr   rb      rd   r3      r
   r5      re   r6          c                    s    S Nr   )rY   good_tokenizationr   r   postprocesork   s   z4test_postprocessor_application.<locals>.postprocesorN)r   postprocess_doc)ri   
target_docr{   resr   ry   r   test_postprocessor_applicationa   s   r   c               
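
# A postprocessor that actually edits the tokenization would follow the same
# contract as the identity postprocessor above: take the proposed
# tokenization (a list of token lists, one per sentence) and return a revised
# version.  A hypothetical sketch, not used by any test here:
def example_lowercase_postprocessor(tokenization):
    # lowercase every token while keeping the sentence structure intact
    return [[token.lower() for token in sentence] for sentence in tokenization]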
   C   s   g dg dg} dd | D }dd | D }d}ddd	dd
ddddd
ddddd
ddddd
ddddddddddd
gddddd
dd d!d"d
dd#d$d%ddddd%d&ddgg}t | |||}||ksiJ d'S )(z`
    Check that the reassembly code counts the indicies correctly, and including OOV chars.
    r\   rc   c                 S   "   g | ]}d d t t|D qS )c                 S      g | ]}d qS Fr   .0rY   r   r   r   
<listcomp>x       7test_reassembly_indexing.<locals>.<listcomp>.<listcomp>ranger   r   ir   r   r   r   x      " z,test_reassembly_indexing.<locals>.<listcomp>c                 S   r   )c                 S   r   rx   r   r   r   r   r   r   y   r   r   r   r   r   r   r   r   y   r   rf   r   r]   r   rg   rj   r^   r,   rk   r_   rl   rm   r`   rn   r2   ra   r   ro   rp   rq   r   rb   rs   rd   r3   rt   r
   r5   ru   re   r6   rv   rw   N)r   reassemble_doc_from_tokens)rz   	good_mwtsgood_expansionsri   r}   r~   r   r   r   test_reassembly_indexingr   s   r   c            
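
# good_mwts and good_expansions above mirror the shape of the tokenization:
# one flag per token saying whether it is a multi-word token, and one
# expansion per token (None when there is nothing to expand).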
      C   sR  g dg} dd t t| d D g}dd t t| d D g}g dg}dd t t|d D g}dd t t|d D g}g d	g}d
d t t|d D g}dd t t|d D g}d}	tt t| |||	 W d   n1 s}w   Y  tt t||||	 W d   n1 sw   Y  t||||	 dS )zi
    Check that the reassembly code complains correctly when the user adds tokens that doesn't exist
    )JoeSmithlivesinSouthern
Californiarb   c                 S   r   r   r   r   r   r   r   r      r   z6test_reassembly_reference_failures.<locals>.<listcomp>r   c                 S   r   rx   r   r   r   r   r   r      r   )r   r   r   r   Californiaarb   c                 S   r   r   r   r   r   r   r   r      r   c                 S   r   rx   r   r   r   r   r   r      r   )r   r   r   r   r   rb   c                 S   r   r   r   r   r   r   r   r      r   c                 S   r   rx   r   r   r   r   r   r      r   zJoe Smith lives in California.N)r   r   r=   r>   r?   r   r   )
bad_addition_tokenizationbad_addition_mwtsbad_addition_expansionsbad_inline_tokenizationbad_inline_mwtsbad_inline_expansionsrz   r   r   ri   r   r   r   "test_reassembly_reference_failures   s"   


r   a	  
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0003
# text = DPA: Iraqi authorities announced that they'd busted up three terrorist cells operating in Baghdad.
1	DPA	DPA	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
2	:	:	PUNCT	:	_	1	punct	1:punct	_
3	Iraqi	Iraqi	ADJ	JJ	Degree=Pos	4	amod	4:amod	_
4	authorities	authority	NOUN	NNS	Number=Plur	5	nsubj	5:nsubj	_
5	announced	announce	VERB	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	1	parataxis	1:parataxis	_
6	that	that	SCONJ	IN	_	9	mark	9:mark	_
7-8	they'd	_	_	_	_	_	_	_	_
7	they	they	PRON	PRP	Case=Nom|Number=Plur|Person=3|PronType=Prs	9	nsubj	9:nsubj	_
8	'd	have	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	9	aux	9:aux	_
9	busted	bust	VERB	VBN	Tense=Past|VerbForm=Part	5	ccomp	5:ccomp	_
10	up	up	ADP	RP	_	9	compound:prt	9:compound:prt	_
11	three	three	NUM	CD	NumForm=Digit|NumType=Card	13	nummod	13:nummod	_
12	terrorist	terrorist	ADJ	JJ	Degree=Pos	13	amod	13:amod	_
13	cells	cell	NOUN	NNS	Number=Plur	9	obj	9:obj	_
14	operating	operate	VERB	VBG	VerbForm=Ger	13	acl	13:acl	_
15	in	in	ADP	IN	_	16	case	16:case	_
16	Baghdad	Baghdad	PROPN	NNP	Number=Sing	14	obl	14:obl:in	SpaceAfter=No
17	.	.	PUNCT	.	_	1	punct	1:punct	_

# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0004
# text = Two of them were being run by 2 officials of the Ministry of the Interior!
1	Two	two	NUM	CD	NumForm=Word|NumType=Card	6	nsubj:pass	6:nsubj:pass	_
2	of	of	ADP	IN	_	3	case	3:case	_
3	them	they	PRON	PRP	Case=Acc|Number=Plur|Person=3|PronType=Prs	1	nmod	1:nmod:of	_
4	were	be	AUX	VBD	Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin	6	aux	6:aux	_
5	being	be	AUX	VBG	VerbForm=Ger	6	aux:pass	6:aux:pass	_
6	run	run	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	0:root	_
7	by	by	ADP	IN	_	9	case	9:case	_
8	2	2	NUM	CD	NumForm=Digit|NumType=Card	9	nummod	9:nummod	_
9	officials	official	NOUN	NNS	Number=Plur	6	obl	6:obl:by	_
10	of	of	ADP	IN	_	12	case	12:case	_
11	the	the	DET	DT	Definite=Def|PronType=Art	12	det	12:det	_
12	Ministry	Ministry	PROPN	NNP	Number=Sing	9	nmod	9:nmod:of	_
13	of	of	ADP	IN	_	15	case	15:case	_
14	the	the	DET	DT	Definite=Def|PronType=Art	15	det	15:det	_
15	Interior	Interior	PROPN	NNP	Number=Sing	12	nmod	12:nmod:of	SpaceAfter=No
16	!	!	PUNCT	.	_	6	punct	6:punct	_

""".lstrip()

def test_lexicon_from_training_data(tmp_path):
    """
    Test a couple aspects of building a lexicon from training data

    expected number of words eliminated for being too long
    duplicate words counted once
    numbers eliminated
    """
    conllu_file = str(tmp_path / "train.conllu")
    with open(conllu_file, "w", encoding="utf-8") as fout:
        fout.write(TRAIN_DATA)
    lexicon, num_dict_feat = utils.create_lexicon("en_test", conllu_file)
    lexicon = sorted(lexicon)
    # note that "authorities" is missing (too long), "2" is missing (a
    # number), and words which occur twice appear only once
    expected_lexicon = ["'d", "announced", "baghdad", "being", "busted", "by", "cells", "dpa",
                        "in", "interior", "iraqi", "ministry", "of", "officials", "operating",
                        "run", "terrorist", "that", "the", "them", "they", "they'd", "three",
                        "two", "up", "were"]
    assert lexicon == expected_lexicon
    assert num_dict_feat == max(len(x) for x in expected_lexicon)