o
    h                     @   s
  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZmZ ddlm  m  mZ ddlm  m  mZ ddlmZ dd	lmZmZmZ ed
ZG dd deZdddZ dd Z!dddZ"dddZ#e$dkre#  dS dS )z
A script to prepare all depparse datasets.

Prepares each of train, dev, test.

Example:
    python -m stanza.utils.datasets.prepare_depparse_treebank {TREEBANK}
Example:
    python -m stanza.utils.datasets.prepare_depparse_treebank UD_English-EWT
    )EnumN)tagger)treebank_to_short_name)downloadDEFAULT_MODEL_DIRUnknownLanguageError)default_charlmspos_charlms)wordvec_args)add_charlm_argsbuild_charlm_argschoose_charlmstanzac                   @   s   e Zd ZdZdZdZdS )TagszTags parameter values.      N)__name__
__module____qualname____doc__GOLD	PREDICTED r   r   j/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/prepare_depparse_treebank.pyr      s    r   returnc                 C   s   | j dddtjtjdd | j dddtjdd | j d	td
dd | j dtd
dd | j dttjdddd | j dddddd t|  d
S )zAdd specific args.z--gold
tag_methodstore_constz,Use gold tags for building the depparse data)destactionconstdefaulthelpz--predictedz1Use predicted tags for building the depparse data)r   r   r   r!   --wordvec_pretrain_fileNz'Exact name of the pretrain file to read)typer    r!   z--tagger_modelzkTagger save file to use.  If not specified, order searched will be saved/models, then $STANZA_RESOURCES_DIR
--save_dirsaved_modelsposz-Where to look for recently trained POS modelsz--no_download_taggerTdownload_taggerstore_falsezDon't try to automatically download a tagger for retagging the dependencies.  Will fail to make silver tags if there is no tagger model to be found)r    r   r   r!   )	add_argumentr   r   r   strospathjoinr   )parserr   r   r   add_specific_args%   s&   

r/   c              
      s  |r|S t  tj|jd| |f }t|dkr|d S t|dkrQdD ]$  fdd|D }t|dkr<|d   S t|dkrItd|q%td||jsVdS tjt	| d	|d
 }zt
| dd	|id W |S  ty } z
td| |jf |d}~ww )zj
    Preferentially chooses a retrained tagger model, but tries to download one if that doesn't exist
    z
%s_%s_*.ptr   r   )z_trans_tagger.ptz_charlm_tagger.ptz_nocharlm_tagger.ptc                    s   g | ]	}|  r|qS r   )endswith).0xendingr   r   
<listcomp>B   s    z'choose_tagger_model.<locals>.<listcomp>zWCould not choose among the candidate taggers... please pick one with --tagger_model: {}Nr&   z.pt)langpackage
processorsa  The language %s appears to be a language new to Stanza.  Unfortunately, that means there are no taggers available for retagging the dependency dataset.  Furthermore, there are no tagger models for this language found in %s.  You can specify a different directory for already trained tagger models with --save_dir, specify an exact tagger model name with --tagger_model, or use gold tags with --gold)globr+   r,   r-   save_dirlenFileNotFoundErrorformatr'   r   r   r   )short_languagedatasettagger_modelargs
candidatesbest_candidatespos_pather   r3   r   choose_tagger_model6   s0   rF   c                    s.  |j tju rt| |||d  d	S |j tju rt| }|d\}}d|d d|d|ddg t|||j	|}|d	u r?t
d
td||| tj|\}}	 d|d|	g  |jrb d|jg7  n t||g   t|||jtt}
t||
} |   fdd}t| |||d | d	S td|j )zProcess treebank.DEPPARSE_DATA_DIR_z--wordvec_dirWORDVEC_DIRz--langz--shorthandz--modepredictNzcCannot find a tagger for language %s, dataset %s - you can specify one with the --tagger_model flagz"Using tagger model in %s for %s_%sr$   z--save_namer"   c                    sb   |  d| d| d}| d| d| d}d|d|g} | }t d||| t| d S )N/.z.conlluz--eval_filez--output_filez+Running tagger to retag {} to {}
  Args: {})loggerinfor=   r   main)tokenizer_dirtokenizer_filedest_dir	dest_file
short_nameoriginalretaggedtagger_args	base_argsr   r   retag_datasety   s   z'process_treebank.<locals>.retag_datasetzUnknown tags method: {})r   r   r   prepare_tokenizer_treebankcopy_conllu_treebankr   r   splitrF   r@   r<   rM   rN   r+   r,   wordvec_pretrain_filer
   r   charlmr   r	   r   
ValueErrorr=   )treebank
model_typepathsrA   rT   r>   r?   r@   
tagger_dirtagger_namer_   charlm_argsrZ   r   rX   r   process_treebankU   s2   
rg   c                   C   s   t tt jjt dS )zCall Process Treebank.N)commonrO   rg   	ModelTypeDEPPARSEr/   r   r   r   r   rO      s   rO   __main__)r   N)%r   enumr   r9   loggingr+   stanza.modelsr   stanza.models.common.constantr   stanza.resources.commonr   r   r   !stanza.resources.default_packagesr   r	   stanza.utils.datasets.commonutilsdatasetsrh   0stanza.utils.datasets.prepare_tokenizer_treebankr[   stanza.utils.training.run_posr
   stanza.utils.training.commonr   r   r   	getLoggerrM   r   r/   rF   rg   rO   r   r   r   r   r   <module>   s,    



4
