"""
This script allows for training or testing on dev / test of the UD tokenizer.

If run with a single treebank name, it will train or test that treebank.
If run with ud_all or all_ud, it will iterate over all UD treebanks it can find.

Mode can be set to train&dev with --train, to dev set only
with --score_dev, and to test set only with --score_test.

Treebanks are specified as a list.  all_ud or ud_all means to look for
all UD treebanks.

Extra arguments are passed to the tokenizer.  In case the run script
itself is shadowing arguments, you can specify --extra_args as a
parameter to mark where the tokenizer arguments start.

Default behavior is to discard the output and just print the results.
To keep the results instead, use --save_output
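
A couple of typical invocations, shown for illustration only (the
treebank name here is just an example):

  python -m stanza.utils.training.run_tokenizer UD_English-EWT --train
  python -m stanza.utils.training.run_tokenizer ud_all --score_dev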
"""

import logging
import math
import os

from stanza.models import tokenizer
from stanza.utils.avg_sent_len import avg_sent_len
from stanza.utils.training import common
from stanza.utils.training.common import Mode

logger = logging.getLogger('stanza')

def uses_dictionary(short_language):
    """
    Some of the languages (as shown here) have external dictionaries

    We found this helped the overall tokenizer performance
    If these can't be found, they can be extracted from the previous iteration of models
    """
    if short_language in ("ja", "th", "zh", "zh-hans", "zh-hant"):
        return True
    return False

def run_treebank(mode, paths, treebank, short_name,
                 temp_output_file, command_args, extra_args):
    tokenize_dir = paths["TOKENIZE_DATA_DIR"]

    short_language = short_name.split("_")[0]

    # tokenizer labels built from the training data
    label_type = "--label_file"
    label_file = f"{tokenize_dir}/{short_name}-ud-train.toklabels"
    dev_type = "--txt_file"
    dev_file = f"{tokenize_dir}/{short_name}.dev.txt"
    test_type = "--txt_file"
    test_file = f"{tokenize_dir}/{short_name}.test.txt"
    train_type = "--txt_file"
    train_file = f"{tokenize_dir}/{short_name}.train.txt"

    train_dev_args = ["--dev_txt_file", dev_file,
                      "--dev_label_file", f"{tokenize_dir}/{short_name}-ud-dev.toklabels"]
    # the Chinese datasets are run with --skip_newline
    if short_language == "zh" or short_language.startswith("zh-"):
        train_dev_args = ["--skip_newline"] + train_dev_args

    train_gold = f"{tokenize_dir}/{short_name}.train.gold.conllu"
    dev_gold = f"{tokenize_dir}/{short_name}.dev.gold.conllu"
    test_gold = f"{tokenize_dir}/{short_name}.test.gold.conllu"

    train_mwt = f"{tokenize_dir}/{short_name}-ud-train-mwt.json"
    dev_mwt = f"{tokenize_dir}/{short_name}-ud-dev-mwt.json"
    test_mwt = f"{tokenize_dir}/{short_name}-ud-test-mwt.json"

    # write predictions to the temp file if one was given, otherwise to the data directory
    train_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.train.pred.conllu"
    dev_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.dev.pred.conllu"
    test_pred = temp_output_file if temp_output_file else f"{tokenize_dir}/{short_name}.test.pred.conllu"

    if mode == Mode.TRAIN:
        # max sequence length: roughly 3x the average sentence length,
        # rounded up to the nearest 100
        seqlen = str(math.ceil(avg_sent_len(label_file) * 3 / 100) * 100)
        train_args = ([label_type, label_file,
                       train_type, train_file,
                       "--lang", short_language,
                       "--max_seqlen", seqlen,
                       "--mwt_json_file", dev_mwt]
                      + train_dev_args
                      + ["--dev_conll_gold", dev_gold,
                         "--conll_file", dev_pred,
                         "--shorthand", short_name])
        if uses_dictionary(short_language):
            train_args = train_args + ["--use_dictionary"]
        train_args = train_args + extra_args
        logger.info("Running train step with args: {}".format(train_args))
        tokenizer.main(train_args)

    if mode == Mode.SCORE_DEV or mode == Mode.TRAIN:
        dev_args = ["--mode", "predict",
                    dev_type, dev_file,
                    "--lang", short_language,
                    "--conll_file", dev_pred,
                    "--shorthand", short_name,
                    "--mwt_json_file", dev_mwt]
        dev_args = dev_args + extra_args
        logger.info("Running dev step with args: {}".format(dev_args))
        tokenizer.main(dev_args)

        results = common.run_eval_script_tokens(dev_gold, dev_pred)
        logger.info("Finished running dev set on\n{}\n{}".format(treebank, results))

    if mode == Mode.SCORE_TEST or mode == Mode.TRAIN:
        test_args = ["--mode", "predict",
                     test_type, test_file,
                     "--lang", short_language,
                     "--conll_file", test_pred,
                     "--shorthand", short_name,
                     "--mwt_json_file", test_mwt]
        test_args = test_args + extra_args
        logger.info("Running test step with args: {}".format(test_args))
        tokenizer.main(test_args)

        results = common.run_eval_script_tokens(test_gold, test_pred)
        logger.info("Finished running test set on\n{}\n{}".format(treebank, results))

    if mode == Mode.SCORE_TRAIN:
        # score the train set as if it were a test set
        train_args = ["--mode", "predict",
                      train_type, train_file,
                      "--lang", short_language,
                      "--conll_file", train_pred,
                      "--shorthand", short_name,
                      "--mwt_json_file", train_mwt]
        train_args = train_args + extra_args
        logger.info("Running test step with args: {}".format(train_args))
        tokenizer.main(train_args)

        results = common.run_eval_script_tokens(train_gold, train_pred)
        logger.info("Finished running train set as a test on\n{}\n{}".format(treebank, results))

def main():
    common.main(run_treebank, "tokenize", "tokenizer", sub_argparse=tokenizer.build_argparse())

if __name__ == "__main__":
    main()