o
    h
                     @   sZ   d Z ddlZddlmZ ddlmZ g dZdd Zdd	 Zd
d Z	e
dkr+e	  dS dS )z
Downloads (if necessary) conll03 from Huggingface, then converts it to Stanza .json

Some online sources for CoNLL 2003 require multiple pieces, but it is currently hosted on HF:
https://huggingface.co/datasets/conll2003
    N)get_default_paths)write_dataset)%OzB-PERSONzI-PERSONzB-NORPzI-NORPzB-FACzI-FACzB-ORGzI-ORGzB-GPEzI-GPEzB-LOCzI-LOCz	B-PRODUCTz	I-PRODUCTzB-DATEzI-DATEzB-TIMEzI-TIMEz	B-PERCENTz	I-PERCENTzB-MONEYzI-MONEYz
B-QUANTITYz
I-QUANTITYz	B-ORDINALz	I-ORDINALz
B-CARDINALz
I-CARDINALzB-EVENTzI-EVENTzB-WORK_OF_ARTzI-WORK_OF_ARTzB-LAWzI-LAWz
B-LANGUAGEz
I-LANGUAGEc                 C   sh   g }|D ]-}|  dr|d  drq|d D ]}|d }dd |d D }|tt|| qq|S )	Nenglishdocument_idzpt/nt	sentenceswordsc                 S   s   g | ]}t | qS  )	ID_TO_TAG.0xr	   r	   f/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/convert_ontonotes.py
<listcomp>   s    z+convert_dataset_section.<locals>.<listcomp>named_entities)
startswithappendlistzip)config_namesectionr   docsentencer   tagsr	   r	   r   convert_dataset_section   s   r   c              
      s   zddl m} W n ty } ztdd }~ww | dkr d n| dv r'd n| dkr.d	 ntd
|  |d |d} fdd|d |d |d fD }t|||  d S )Nr   )load_datasetzBPlease install the datasets package to process CoNLL03 with Stanzaen_ontonotes
english_v4)zh_ontonoteszzh-hans_ontonotes
chinese_v4ar_ontonotes	arabic_v4z0Unknown short name for downloading ontonotes: %sconll2012_ontonotesv5)	cache_dirc                    s   g | ]}t  |qS r	   )r   r   r   r	   r   r   /   s    z#process_dataset.<locals>.<listcomp>train
validationtest)datasetsr   ImportError
ValueErrorr   )
short_name
conll_pathner_output_pathr   edatasetr(   r	   r$   r   process_dataset   s    $r0   c                  C   s6   t  } | d }tj|dd}| d }td|| d S )NNERBASEr   r   NER_DATA_DIR)r   ospathjoinr0   )pathsner_input_pathr,   r-   r	   r	   r   main2   s
   r8   __main__)__doc__r3   stanza.utils.default_pathsr   stanza.utils.datasets.ner.utilsr   r
   r   r0   r8   __name__r	   r	   r	   r   <module>   s    
