o
    –hõ  ã                   @   s&   d Z ddlZddlmZ ddd„ZdS )zµ
Converts the Thai LST20 dataset to a format usable by Stanza's NER model

The dataset in the original format has a few tag errors which we
automatically fix (or at worst cover up)
é    N)Úconvert_bio_to_jsonTc                    sv  |dksJ ‚d}ˆ d }‡ fdd„|D ƒ}|s|d }|D ]\}}dd„ t  |¡D ƒ}|dkr2d	}t j |d
||f ¡}	t|	ƒ t|	dddá}
|D ]Ö}g }tt j ||¡ddd}| ¡ }W d   ƒ n1 sjw   Y  t|ƒD ]­\}}| ¡  	d¡}t
|ƒdkr|d dkrŽ|sŽqs|d |d }}|dkrd}|dkr£d}|dkr©d}|dkr¯d}|dkrµd}|dkrì|d t
|ƒk rì||d   ¡  	d¡}t
|ƒdkrê|d }d|v sÞd |v rç||dd …  }nd!}nd!}d|v rö| dd"¡}d#|v s
|d$ks
|d%ks
|d&krd!}|
 d' ||¡¡ |
 d(¡ qs|
 d(¡ qsqKW d   ƒ n	1 s-w   Y  qt|||ƒ d S ))NÚth_lst20)ÚtrainÚevalÚtestÚNER_DATA_DIRc                    s&   g | ]}t j ˆ d  dd|¡|f‘qS )ÚNERBASEÚthaiÚLST20_Corpus)ÚosÚpathÚjoin)Ú.0Úx©Úpaths© úb/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/convert_lst20.pyÚ
<listcomp>   s   & z!convert_lst20.<locals>.<listcomp>Ú_no_wsc                 S   s   g | ]
}|d  dkr|‘qS )r   ÚTr   )r   Útextr   r   r   r      s    r   Údevz	%s.%s.bioÚwzutf-8)ÚencodingÚrú	é   r   Ú_é   ÚMEA_BIÚB_MEAÚOBRN_BÚB_BRNÚORG_IÚI_ORGÚPER_IÚI_PERÚLOC_IÚI_LOCÚBÚI_ÚE_ÚOú-ÚABBÚDDEMÚIÚ__z{}	{}Ú
)r   Úlistdirr   r   ÚprintÚopenÚ	readlinesÚ	enumerateÚstripÚsplitÚlenÚreplaceÚwriteÚformatr   )r   Ú
short_nameÚinclude_space_charÚSHARDSÚBASE_OUTPUT_PATHÚinput_splitÚinput_folderÚ
split_typeÚ	text_listÚoutput_pathÚfoutr   ÚlstÚfinÚlinesÚline_idxÚliner   ÚwordÚtagÚx_nextÚtag_nextr   r   r   Úconvert_lst20   sl   
ÿ(Üûÿ€+rR   )T)Ú__doc__r   Ústanza.utils.datasets.ner.utilsr   rR   r   r   r   r   Ú<module>   s    