o
    –h>	  ã                   @   s*   d Z ddlZddlZddlZddd„ZdS )aY  
Converts the Thai NNER22 dataset to a format usable by Stanza's NER model

The dataset is already written in json format, so we will convert into a compatible json format.

The dataset in the original format has nested NER format which we will only extract the first layer
of NER tag and write it in the format accepted by current Stanza model
é    NTc              	   C   sü  |dksJ ‚d}t j | d ddddd¡}|s|d	 }|D ]Þ}t j |d
| ¡}t j | d d||f ¡}t d||f ¡ t t|ƒ¡}g }	tt	|ƒƒD ]‡}
||
 d ||
 d }}t	|ƒd}}g i }}|D ]<}|d \}}||kr¦|d  
¡ }|}t||ƒD ]"}||krŽd| }n||d kr™d| }nd| }||| f||< qƒqjt|ƒD ]#}i }||vr¿d|| |d< |d< n
|| \|d< |d< | |¡ q«|	 |¡ qMt|dƒ}tj|	|dd W d   ƒ n1 síw   Y  t d||f ¡ qd S )NÚ	th_nner22)ÚtrainÚdevÚtestÚNERBASEÚthaiz	Thai-NNERÚdatazscb-nner-th-2022ÚpostprocÚ_no_wsz%s.jsonÚNER_DATA_DIRz
%s.%s.jsonzOutput path for %s split at %sÚtokensÚentitiesr   ÚspanÚentity_typezB-é   zE-zI-ÚOÚnerÚtextÚw)Úindentz$%s.%s.json file successfully created)ÚosÚpathÚjoinÚloggingÚinfoÚjsonÚloadÚopenÚrangeÚlenÚupperÚappendÚdump)ÚpathsÚ
short_nameÚinclude_space_charÚSHARDSÚBASE_INPUT_PATHÚshardÚ
input_pathÚoutput_pathr   Ú	documentsÚiÚtokenr   Útoken_lengthÚsofarÚdocumentÚner_dictÚentityÚstartÚstopr   ÚjÚner_tagÚkÚdict_addÚoutfile© r:   úc/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/convert_nner22.pyÚconvert_nner22   sP   


€ÿÐr<   )T)Ú__doc__r   r   r   r<   r:   r:   r:   r;   Ú<module>   s
    	