o
    h	%  ã                   @   sî   d dl Z d dlZd dlZd dlZd dlT d dlmZ d dlmZ d dl	m
Z
 ejjejjgZd Ą Zd Ą ZdZd	Zd
d Zdd Zd Ą ZdZdZdd Zd Ą ZdZdZdd Zedkrue Ą  dĄZ ee dddZ!e
ddde! dS dS )é    N)Ú*)Úconvert_conllu_to_txt)Úread_document)Úwrite_sectionuî  
āļŠāļļāļĢāļĒāļļāļāļāđ	NN	B_PER	B_CLS
āļĒāļąāļ	VV	O	I_CLS
āļāļāļīāđāļŠāļ	VV	O	I_CLS
āļĨāļāļāļēāļĄ	VV	O	I_CLS
_	PU	O	I_CLS
MOU	NN	O	I_CLS
_	PU	O	I_CLS
āļāļąāļ	PS	O	I_CLS
āļ­āļĩāļĒāļđ	NN	B_ORG	I_CLS
āđāļĄāđ	NG	O	I_CLS
āļāļĢāļ°āļāļ	VV	O	I_CLS
āļŠāļąāļĄāļāļąāļāļāđ	NN	O	E_CLS

1	NU	B_DTM	B_CLS
_	PU	I_DTM	I_CLS
āļāļąāļāļĒāļēāļĒāļ	NN	I_DTM	I_CLS
_	PU	I_DTM	I_CLS
2550	NU	E_DTM	I_CLS
_	PU	O	I_CLS
12:21	NU	B_DTM	I_CLS
_	PU	I_DTM	I_CLS
āļ.	CL	E_DTM	E_CLS

āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§	NN	O	B_CLS
āļĢāļēāļĒāļāļēāļ	VV	O	I_CLS
āđāļāļīāđāļĄāđāļāļīāļĄ	VV	O	I_CLS
āļ§āđāļē	CC	O	E_CLS
_	PU	O	O
āļāļēāļ	PS	O	B_CLS
āļāļēāļĢ	FX	O	I_CLS
āļĨāļ	VV	O	I_CLS
āļāļ·āđāļāļāļĩāđ	NN	O	I_CLS
āļāļ	VV	O	I_CLS
āļ§āđāļē	CC	O	E_CLS
u  
1	āļŠāļļāļĢāļĒāļļāļāļāđ	_	_	_	_	0	root	0:root	SpaceAfter=No|NewPar=Yes
2	āļĒāļąāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āļāļāļīāđāļŠāļ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļĨāļāļāļēāļĄ	_	_	_	_	3	dep	3:dep	_
5	MOU	_	_	_	_	4	dep	4:dep	_
6	āļāļąāļ	_	_	_	_	5	dep	5:dep	SpaceAfter=No
7	āļ­āļĩāļĒāļđ	_	_	_	_	6	dep	6:dep	SpaceAfter=No
8	āđāļĄāđ	_	_	_	_	7	dep	7:dep	SpaceAfter=No
9	āļāļĢāļ°āļāļ	_	_	_	_	8	dep	8:dep	SpaceAfter=No
10	āļŠāļąāļĄāļāļąāļāļāđ	_	_	_	_	9	dep	9:dep	SpaceAfter=No

1	1	_	_	_	_	0	root	0:root	_
2	āļāļąāļāļĒāļēāļĒāļ	_	_	_	_	1	dep	1:dep	_
3	2550	_	_	_	_	2	dep	2:dep	_
4	12:21	_	_	_	_	3	dep	3:dep	_
5	āļ.	_	_	_	_	4	dep	4:dep	SpaceAfter=No

1	āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§	_	_	_	_	0	root	0:root	SpaceAfter=No
2	āļĢāļēāļĒāļāļēāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āđāļāļīāđāļĄāđāļāļīāļĄ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļ§āđāļē	_	_	_	_	3	dep	3:dep	_
5	āļāļēāļ	_	_	_	_	4	dep	4:dep	SpaceAfter=No
6	āļāļēāļĢ	_	_	_	_	5	dep	5:dep	SpaceAfter=No
7	āļĨāļ	_	_	_	_	6	dep	6:dep	SpaceAfter=No
8	āļāļ·āđāļāļāļĩāđ	_	_	_	_	7	dep	7:dep	SpaceAfter=No
9	āļāļ	_	_	_	_	8	dep	8:dep	SpaceAfter=No
10	āļ§āđāļē	_	_	_	_	9	dep	9:dep	SpaceAfter=No
uI  āļŠāļļāļĢāļĒāļļāļāļāđāļĒāļąāļāļāļāļīāđāļŠāļāļĨāļāļāļēāļĄ MOU āļāļąāļāļ­āļĩāļĒāļđāđāļĄāđāļāļĢāļ°āļāļāļŠāļąāļĄāļāļąāļāļāđ1 āļāļąāļāļĒāļēāļĒāļ 2550 12:21 āļ.āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§āļĢāļēāļĒāļāļēāļāđāļāļīāđāļĄāđāļāļīāļĄāļ§āđāļē āļāļēāļāļāļēāļĢāļĨāļāļāļ·āđāļāļāļĩāđāļāļāļ§āđāļē

z}000000010010000010000100010001000100100001000000021000000010000100000100200000000001000001000000001001000100101000000101002

c           	   	   C   s&  t  Ą }t|dd|  ttj |dĄ}| Ą  Ą }W d    n1 s&w   Y  ttj |dĄ}| Ą }W d    n1 sCw   Y  ttj |dĄ}| Ą }W d    n1 s`w   Y  ||kskJ ||ksqJ ||kswJ t	|t	|ksJ W d    d S 1 sw   Y  d S )NÚlst20Útrainzth_lst20.train.gold.conlluzth_lst20.train.txtzth_lst20-ud-train.toklabels)
ÚtempfileÚTemporaryDirectoryr   ÚopenÚosÚpathÚjoinÚreadÚstripÚlen)	Ú	documentsÚexpected_conlluÚexpected_txtÚexpected_labelsÚ
output_dirÚfinÚconlluÚtxtÚlabelsĐ r   úl/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/tests/tokenization/test_tokenization_lst20.pyÚcheck_resultsx   s    
ĸ
ĸ
ĸ"ôr   c                  C   ó.   t  Ą  dĄ} t| ddd}t|ttt dS )zë
    A small test just to verify that the output is being produced as we want

    Note that there currently are no spaces after the first sentence.
    Apparently this is wrong, but weirdly, doing that makes the model even worse.
    Ú
FĐÚspaces_afterÚsplit_clausesN)ÚSMALL_LST_SAMPLEr   Úsplitr   r   ÚEXPECTED_CONLLUÚEXPECTED_TXTÚEXPECTED_LABELSĐÚlinesr   r   r   r   Ú
test_small   s   r)   u{  
1	āļŠāļļāļĢāļĒāļļāļāļāđ	_	_	_	_	0	root	0:root	SpaceAfter=No|NewPar=Yes
2	āļĒāļąāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āļāļāļīāđāļŠāļ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļĨāļāļāļēāļĄ	_	_	_	_	3	dep	3:dep	_
5	MOU	_	_	_	_	4	dep	4:dep	_
6	āļāļąāļ	_	_	_	_	5	dep	5:dep	SpaceAfter=No
7	āļ­āļĩāļĒāļđ	_	_	_	_	6	dep	6:dep	SpaceAfter=No
8	āđāļĄāđ	_	_	_	_	7	dep	7:dep	SpaceAfter=No
9	āļāļĢāļ°āļāļ	_	_	_	_	8	dep	8:dep	SpaceAfter=No
10	āļŠāļąāļĄāļāļąāļāļāđ	_	_	_	_	9	dep	9:dep	_

1	1	_	_	_	_	0	root	0:root	_
2	āļāļąāļāļĒāļēāļĒāļ	_	_	_	_	1	dep	1:dep	_
3	2550	_	_	_	_	2	dep	2:dep	_
4	12:21	_	_	_	_	3	dep	3:dep	_
5	āļ.	_	_	_	_	4	dep	4:dep	_

1	āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§	_	_	_	_	0	root	0:root	SpaceAfter=No
2	āļĢāļēāļĒāļāļēāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āđāļāļīāđāļĄāđāļāļīāļĄ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļ§āđāļē	_	_	_	_	3	dep	3:dep	_
5	āļāļēāļ	_	_	_	_	4	dep	4:dep	SpaceAfter=No
6	āļāļēāļĢ	_	_	_	_	5	dep	5:dep	SpaceAfter=No
7	āļĨāļ	_	_	_	_	6	dep	6:dep	SpaceAfter=No
8	āļāļ·āđāļāļāļĩāđ	_	_	_	_	7	dep	7:dep	SpaceAfter=No
9	āļāļ	_	_	_	_	8	dep	8:dep	SpaceAfter=No
10	āļ§āđāļē	_	_	_	_	9	dep	9:dep	_
uK  āļŠāļļāļĢāļĒāļļāļāļāđāļĒāļąāļāļāļāļīāđāļŠāļāļĨāļāļāļēāļĄ MOU āļāļąāļāļ­āļĩāļĒāļđāđāļĄāđāļāļĢāļ°āļāļāļŠāļąāļĄāļāļąāļāļāđ 1 āļāļąāļāļĒāļēāļĒāļ 2550 12:21 āļ. āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§āļĢāļēāļĒāļāļēāļāđāļāļīāđāļĄāđāļāļīāļĄāļ§āđāļē āļāļēāļāļāļēāļĢāļĨāļāļāļ·āđāļāļāļĩāđāļāļāļ§āđāļē

z00000001001000001000010001000100010010000100000002010000000100001000001002000000000001000001000000001001000100101000000101002

c                  C   s.   t  Ą  dĄ} t| ddd}t|ttt dS )zA
    This version of the test adds the space after attribute
    r   TFr   N)r"   r   r#   r   r   ÚEXPECTED_SPACE_CONLLUÚEXPECTED_SPACE_TXTÚEXPECTED_SPACE_LABELSr'   r   r   r   Útest_space_afterī   ó   r-   u}  
1	āļŠāļļāļĢāļĒāļļāļāļāđ	_	_	_	_	0	root	0:root	SpaceAfter=No|NewPar=Yes
2	āļĒāļąāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āļāļāļīāđāļŠāļ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļĨāļāļāļēāļĄ	_	_	_	_	3	dep	3:dep	_
5	MOU	_	_	_	_	4	dep	4:dep	_
6	āļāļąāļ	_	_	_	_	5	dep	5:dep	SpaceAfter=No
7	āļ­āļĩāļĒāļđ	_	_	_	_	6	dep	6:dep	SpaceAfter=No
8	āđāļĄāđ	_	_	_	_	7	dep	7:dep	SpaceAfter=No
9	āļāļĢāļ°āļāļ	_	_	_	_	8	dep	8:dep	SpaceAfter=No
10	āļŠāļąāļĄāļāļąāļāļāđ	_	_	_	_	9	dep	9:dep	_

1	1	_	_	_	_	0	root	0:root	_
2	āļāļąāļāļĒāļēāļĒāļ	_	_	_	_	1	dep	1:dep	_
3	2550	_	_	_	_	2	dep	2:dep	_
4	12:21	_	_	_	_	3	dep	3:dep	_
5	āļ.	_	_	_	_	4	dep	4:dep	_

1	āļāļđāđāļŠāļ·āđāļ­āļāđāļēāļ§	_	_	_	_	0	root	0:root	SpaceAfter=No
2	āļĢāļēāļĒāļāļēāļ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āđāļāļīāđāļĄāđāļāļīāļĄ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļ§āđāļē	_	_	_	_	3	dep	3:dep	_

1	āļāļēāļ	_	_	_	_	0	root	0:root	SpaceAfter=No
2	āļāļēāļĢ	_	_	_	_	1	dep	1:dep	SpaceAfter=No
3	āļĨāļ	_	_	_	_	2	dep	2:dep	SpaceAfter=No
4	āļāļ·āđāļāļāļĩāđ	_	_	_	_	3	dep	3:dep	SpaceAfter=No
5	āļāļ	_	_	_	_	4	dep	4:dep	SpaceAfter=No
6	āļ§āđāļē	_	_	_	_	5	dep	5:dep	_
z00000001001000001000010001000100010010000100000002010000000100001000001002000000000001000001000000001002000100101000000101002

c                  C   r   )zJ
    This version of the test also resplits on spaces between clauses
    r   Tr   N)r"   r   r#   r   r   ÚEXPECTED_CLAUSE_CONLLUÚEXPECTED_CLAUSE_TXTÚEXPECTED_CLAUSE_LABELSr'   r   r   r   Útest_split_clauseā   r.   r2   Ú__main__r   Fr   Úfoor   r   )"r   r   ÚpytestÚstanzaÚstanza.testsÚstanza.utils.datasets.commonr   Ú3stanza.utils.datasets.tokenization.convert_th_lst20r   Ú<stanza.utils.datasets.tokenization.process_thai_tokenizationr   ÚmarkÚtravisÚpipelineÚ
pytestmarkr   r"   r$   r%   r&   r   r)   r*   r+   r,   r-   r/   r0   r1   r2   Ú__name__r#   r(   r   r   r   r   r   Ú<module>   sH    #Ý%ä:	ä	ãü