o
    he
                     @   s   d Z ddlmZ ddlm  m  mZ ddlm  m  mZ ddl	m  m  m
Z
 dddZdd Zd	d
 Zdd ZedkrFe  dS dS )z
A script to prepare all lemma datasets.

For example, do
  python -m stanza.utils.datasets.prepare_lemma_treebank TREEBANK
such as
  python -m stanza.utils.datasets.prepare_lemma_treebank UD_English-EWT

and it will prepare each of train, dev, test
    )treebank_to_short_nameNreturnc                 C   s   | j dddddd d S )Nz--no_lemma_classifierlemma_classifierstore_falseTzqDon't use the lemma classifier datasets.  Default is to build lemma classifier as part of the original lemmatizer)destactiondefaulthelp)add_argument)parser r   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/prepare_lemma_treebank.pyadd_specific_args   s   
r   c                 C   s   t | G}|D ];}| }|r|drq|d}|d   }|d   }|r4|dks4|dkr5q||kr:q W d   dS W d   d	S 1 sNw   Y  d	S )
z
    Check if a treebank has any lemmas in it

    For example, in Vietnamese-VTB, all the words and lemmas are exactly the same
    in Telugu-MTG, all the lemmas are blank
    #	      _-NTF)openstrip
startswithsplitlower)
train_filefinlinepieceswordlemmar   r   r   check_lemmas   s&   


r    c                 C   s   |  dr |d }tj| |dddd}t|}|std|  nd}tj| |||d |d	 t| }|jrA|t	j
v rCt	| d S d S d S )
NUD_UDBASEtrainconlluT)failz=No lemma information found in %s.  Not augmenting the datasetLEMMA_DATA_DIR)augment)r   commonfind_treebank_dataset_filer    printprepare_tokenizer_treebankcopy_conllu_treebankr   r   prepare_lemma_classifierDATASET_MAPPINGmain)treebank
model_typepathsargs
udbase_dirtrain_conllur'   
short_namer   r   r   process_treebank1   s   
r7   c                   C   s   t tt jjt d S )N)r(   r/   r7   	ModelTypeLEMMAr   r   r   r   r   r/   A   s   r/   __main__)r   N)__doc__stanza.models.common.constantr   stanza.utils.datasets.commonutilsdatasetsr(   0stanza.utils.datasets.prepare_tokenizer_treebankr+   .stanza.utils.datasets.prepare_lemma_classifierr-   r   r    r7   r/   __name__r   r   r   r   <module>   s    

