o
    h                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZ ddlmZmZmZmZ d	d
gd	d
gg ddZedZdd ZdddefddZdd Zdd Zdd Zedkroe  dS dS )a"  
Trains or scores an NER model.

Will attempt to guess the appropriate word vector file if none is
specified, and will use the charlms specified in the resources
for a given dataset or language if possible.

Example command line:
  python3 -m stanza.utils.training.run_ner.py hu_combined

This script expects the prepared data to be in
  data/ner/{lang}_{dataset}.train.json, {lang}_{dataset}.dev.json, {lang}_{dataset}.test.json

If those files don't exist, it will make an attempt to rebuild them
using the prepare_ner_dataset script.  However, this will fail if the
data is not already downloaded.  More information on where to find
most of the datasets online is in that script.  Some of the datasets
have licenses which must be agreed to, so no attempt is made to
automatically download the data.
    N)
ner_tagger)DEFAULT_MODEL_DIR)prepare_ner_dataset)common)Modeadd_charlm_argsbuild_charlm_argschoose_charlmfind_wordvec_pretrain)default_charlmsdefault_pretrainsner_charlmsner_pretrains	--dropout0.6)r   r   z--word_dropout0.1z--locked_dropoutr   z--char_dropoutr   )da_ddtfa_armanvi_vlspstanzac                 C   s   t |  | jddddd d S )Nz
--use_bertF
store_truez-Use the default transformer for this language)defaultactionhelp)r   add_argument)parser r   X/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/training/run_ner.pyadd_ner_args-   s   r   r   c           
      C   sj   t | ||tt}t| ||d}g }|du sd|vr&t| tt||d}d|g}tj| ||dd}	|| |	 S )zf
    Returns one list with the args for this language & dataset's charlm and pretrained embedding
    )	model_dirNz--wordvec_pretrain_fileF)warn)	r	   r   r   r   r
   r   r   r   choose_transformer)
languagedatasetcharlmcommand_args
extra_argsr   charlm_argswordvec_argswordvec_pretrain	bert_argsr   r   r   build_pretrain_args3   s   r+   c                 C   s   | dd\}}t|||j||}t|g }d|ddg}|| | | }|jd ur2|d|jg |jd ur?|d|jg t	|}	t
|	}
|
S )N_   --shorthand--modetrainz--save_namez
--save_dir)splitr+   r$   DATASET_EXTRA_ARGSget	save_nameextendsave_dirr   
parse_argsmodel_file_name)paths
short_namer%   r&   short_languager#   pretrain_argsdataset_args
train_argsargsr4   r   r   r   build_model_filenameF   s   



r@   c              
   C   s  |d }| d\}}	tj|| d}
tj|| d}tj|| d}dd |
||fD }t|dkretd	| d
| d zt| W n t	yd } zt
d| d| d|d }~ww t||	|j||}| tjkrt|g }d|
d|d|ddg}|| | | }td| t| | tjks| tjkrd|d|ddg}|| | }td| t| | tjks| tjkrd|d|ddg}|| | }td| t| d S d S )NNER_DATA_DIRr,   z.train.jsonz	.dev.jsonz
.test.jsonc                 S   s   g | ]
}t j|s|qS r   )ospathexists).0xr   r   r   
<listcomp>k   s    z run_treebank.<locals>.<listcomp>r   zThe data for z( is missing or incomplete.  Cannot find z  Attempting to rebuild...z9An exception occurred while trying to build the data for z0  At least one portion of the data was missing: z8  Please correctly build these files and then try again.z--train_filez--eval_filer.   r/   r0   z Running train step with args: {}predictzRunning dev step with args: {}zRunning test step with args: {})r1   rB   rC   joinlenloggerwarningr   main	ExceptionFileNotFoundErrorr+   r$   r   TRAINr2   r3   infoformatr   	SCORE_DEV
SCORE_TEST)moder9   treebankr:   temp_output_filer%   r&   ner_dirr"   r#   
train_filedev_file	test_filemissing_fileer<   r=   r>   dev_args	test_argsr   r   r   run_treebank`   sT   



r`   c                   C   s   t jtddtt td d S )Nner	nertagger)r@   )r   rM   r`   r   r   build_argparser@   r   r   r   r   rM      s   rM   __main__)__doc__loggingrB   stanza.modelsr   stanza.resources.commonr   stanza.utils.datasets.nerr   stanza.utils.trainingr   stanza.utils.training.commonr   r   r   r	   r
   !stanza.resources.default_packagesr   r   r   r   r2   	getLoggerrK   r   r+   r@   r`   rM   __name__r   r   r   r   <module>   s,    
	:
