o
    h(                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ e ZedZdd	d
Zdd ZdddZdd Zdd Zdd Zdd ZdddZedkrle  dS dS )zG
Entry point for training and evaluating a Bi-LSTM language identifier
    N)datetime)utils)
DataLoader)Trainer)get_tqdmstanzac                 C   s   t  }|jdddd |jddtdd |jd	d
td d |jdddd |jddd d |jddd d |jdddd |jddtdd |jdddd |jddtdd |jdddd |jd d!dd |jd"d#d d t| |j| d$} | S )%Nz--batch_modez*custom settings when running in batch mode
store_true)helpactionz--batch_sizezbatch size for training@   )r	   typedefaultz--eval_lengthzlength of strings to eval onz
--eval_setzeval on dev or testtest)r	   r   z
--data_dirz"directory with train/dev/test dataz--load_namezpath to load model fromz--modeztrain or evaltrainz--num_epochsznumber of epochs for training2   z--randomizez!take random substrings of samplesz--randomize_lengths_rangez1range of lengths to use when random sampling textz5,20z--merge_labels_for_evalzJmerge some language labels for eval (e.g. "zh-hans" and "zh-hant" to "zh")z--save_best_epochsz.save model for every epoch with new best scorez--save_namezwhere to save modelargs)argparseArgumentParseradd_argumentintrandomize_lengths_ranger   add_device_args
parse_args)r   parser r   X/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/lang_identifier.pyr      s,   
r   c                 C   sF   dd |  dD }|d |d k s!J d|d  d|d  d|S )	z-
    Range of lengths for random samples
    c                 S   s   g | ]}t |qS r   )r   .0xr   r   r   
<listcomp>1   s    z+randomize_lengths_range.<locals>.<listcomp>,r      zInvalid range: (z, ))split)
range_listrange_boundariesr   r   r   r   -   s   .r   c                 C   s6   t | d} td | jdkrt|  d S t|  d S )Nr   r   r   )r   torchmanual_seedmodetrain_model
eval_modelr   r   r   r   main6   s
   


r,   c              	      s   i }i } fddt  jD }|D ]N}t|}|  d}W d    n1 s-w   Y  dd |D }|D ]%}|d }	|	|vrKt|||	< |d }
t|
D ]}||vr_t|||< qSq;qt||d< t||d< ||fS )	Nc                    $   g | ]}d |v r j  d| qS r   /data_dirr   r   r   r   r    B      $ z!build_indexes.<locals>.<listcomp>
c                 S   s   g | ]}|  rt|qS r   )stripjsonloads)r   liner   r   r   r    F   s    labeltextUNKz<PAD>)	oslistdirr1   openreadr4   r$   lenlist)r   
tag_to_idxchar_to_idxtrain_files
train_file	curr_filelinesexamplesexampler8   sequencecharr   r   r   build_indexes?   s,   
rK   c              	      sx  t  \}}t j} fddt jD }| j||| j t j} fddt jD }|j j|||d j	d  j
|| j|jd} jrb j|d< tt  d j  t| jd u jd	}d
}	td jd D ]}
tt  d|
  tt  dt|j  |j} jst|}|D ]}|d |d f}|| qtt  d t|| jd\}}}}}tt  d|  ||	kr$tt  d  jrd|
 nd }|j|d tt j
d}d|i||||fD ]}|t |d  qW d    n	1 sw   Y  |}	tt  d | j||| j qwd S )Nc                    r-   r.   r0   r   r   r   r   r    Z   r2   ztrain_model.<locals>.<listcomp>c                    r-   )devr/   r0   r   r   r   r   r    ^   r2   F	randomize
max_length)
model_pathrB   rA   
batch_sizelang_weights	load_namez	Loading model from: 
load_modeldevice        r"   z	Epoch z	Num training batches: 	sentencestargetsz(	Epoch complete. Evaluating on dev data.)
batch_modez	Current dev accuracy: z	New best score. Saving model.epoch)r8   wdev_accuracyr3   z	Resampling training data.)!rK   r   rV   r;   r<   r1   	load_datarQ   rN   eval_length	save_namerR   rS   loggerinfor   nowr   range
num_epochsr?   batchesrZ   tqdmupdateeval_trainersave_best_epochssaver=   score_log_pathwriter5   dumps)r   rA   rB   
train_datarC   dev_data	dev_filestrainer_configtrainerbest_accuracyr[   rf   train_batchinputscurr_dev_accuracycurr_confusion_matrixcurr_precisionscurr_recallscurr_f1smodel_labelscore_log_file	score_logr   r   r   r*   U   s`   




r*   c                 C   sB   t j| }|d r| dt|d    d}|S |  d}|S )zh
    Helper that will determine corresponding log file (e.g. /path/to/demo.pt to /path/to/demo.json
    r"   Nz.json)r;   pathsplitextr?   )	file_pathmodel_suffixrl   r   r   r   rl      s   
rl   c                    s.  d  j  jd}t|d jd}t j} fddt jD }|j j||j	j
|j	jd jd t|| j j d\}}}}}	tt  d	 j d
|   jrY jnt j }
tj|
rg jrt|
d}d|i||||	fD ]}|t|d  qvW d    d S 1 sw   Y  d S d S )N)rP   rS   rQ   TrT   c                    s&   g | ]} j |v r j d | qS )r/   )eval_setr1   r   r   r   r   r       s   & zeval_model.<locals>.<listcomp>FrM   )rZ   fine_grained	z accuracy: r\   r]   r3   )rS   rQ   r   rV   r   r;   r<   r1   r^   modelrB   rA   r_   ri   rZ   merge_labels_for_evalra   rb   r   rc   r   r`   rl   r   existsr=   rm   r5   rn   )r   rr   rs   	test_data
test_filescurr_accuracyrx   ry   rz   r{   eval_save_pathr}   r~   r   r   r   r+      s.   
 
"r+   FTc                    s  |j }|j}i  |D ]}i  |< |D ]}d | |< qq
|j}|s%t|}|D ]6}	|	d |	d f}
| |
}t|	d |D ]\}}|rG|| n|| dd  ||    d7  < q=q't fdd D }t fdd D }t|t| }d	d
i}d	di}d	di}|D ]%t fdd|D }|dkrt   t| |< qd|< q|D ]%t fdd|D }|dkrt   t| |< qd|< q|D ]'}|| dkr|| dkrd||< qd|| ||   || ||   ||< q| |||fS )zA
    Produce dev accuracy and confusion matrix for a trainer
    r   rX   rY   -r"   c                    s(   g | ] t  fd d  D qS )c                       g | ]}  | qS r   r   )r   j)confusion_matrixir   r   r           z+eval_trainer.<locals>.<listcomp>.<listcomp>)sum)r   r   )r   r   r       s   ( z eval_trainer.<locals>.<listcomp>c                    s   g | ]} | | qS r   r   )r   r   r   r   r   r       r   r   	precisionrecallf1c                    s   g | ]} |  qS r   r   r   k)r   prediction_labelr   r   r       r   rW   c                    r   r   r   r   )r   target_labelr   r   r       r   g       @)	rA   
idx_to_tagrf   rg   predictzipr$   r   float)rs   rp   rZ   r   rA   r   	row_label	col_labelrf   	dev_batchrv   predictions
target_idx
predictiontotal_examplestotal_correctr]   precision_scoresrecall_scores	f1_scorestotalr8   r   )r   r   r   r   ri      sP   



"ri   __main__)N)FT)__doc__r   r5   loggingr;   randomr'   r   stanza.models.commonr   stanza.models.langid.datar   stanza.models.langid.trainerr   stanza.utils.get_tqdmr   rg   	getLoggerra   r   r   r,   rK   r*   rl   r+   ri   __name__r   r   r   r   <module>   s2    


		8
8
