o
    –h/=  ã                   @   sL  d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlZddl	Z
ddlZddlZddlmZmZ ddlZddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlm  m  m Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' e (d¡Z)dd„ Z*ddd„Z+ddd„Z,dd„ Z-dd„ Z.e/dkr¤e,ƒ  dS dS )aÂ  
Entry point for training and evaluating a multi-word token (MWT) expander.

This MWT expander combines a neural sequence-to-sequence architecture with a dictionary
to decode the token into multiple words.
For details please refer to paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf

In the case of a dataset where all of the MWT exactly split into the words
composing the MWT, a classifier over the characters is used instead of the seq2seq
é    N)Údatetime)ÚnnÚoptim)Ú
DataLoaderÚBinaryDataLoader)Úmwts_composed_of_words)ÚVocab)ÚTrainer)Úscorer)Úutils)ÚDocument)ÚCoNLL)Ú_training_loggingÚstanzac                  C   s¨  t  ¡ } | jdtddd | jdtd dd | jdtd dd | jdtd d	d | jd
td d	d | jddddgd | jdtdd | jdtdd | jddddd | jdddd | jdddd | jdtd d! | jd"td#d! | jd$td d%d | jd&td'd! | jd(td'd! | jd)td#d! | jd*td+d! | jd,d-g d.¢d/d0 | jd1d2dd3d | jd4d5td6d7 | jd8d dd9d: | jd;d<dd=d | jd>td?d@d | jdAtdBdCd | jdDtdEdFd | jdGtdHd! | jdItdJdKd | jdLtdJd! | jdMtd#d! | jdNtdOdPd | jdQtdRdSd | jdTtdUdd | jdVtd dWd | jdXtd dYd | jdZtd[d! t | ¡ | jd\dd]d | jd^d d_d` | S )aNz
--data_dirzdata/mwtzRoot dir for saving models.)ÚtypeÚdefaultÚhelpz--train_filezInput file for data loader.z--eval_filez--output_filezOutput CoNLL-U file.z--gold_filez--modeÚtrainÚpredict)r   Úchoicesz--langÚLanguage)r   r   z--shorthandzTreebank shorthandz	--no_dictÚensemble_dictÚstore_falsezDDo not ensemble dictionary with seq2seq. By default ensemble a dict.)ÚdestÚactionr   z--ensemble_early_stopÚ
store_truez-Early stopping based on ensemble performance.)r   r   z--dict_onlyz+Only train a dictionary-based MWT expander.z--hidden_diméd   )r   r   z	--emb_dimé2   z--num_layerszONumber of layers in model encoder.  Defaults to 1 for seq2seq, 2 for classifierz--emb_dropoutg      à?z	--dropoutz--max_dec_lenz--beam_sizeé   z--attn_typeÚsoft)r   ÚmlpÚlinearÚdeepzAttention type)r   r   r   z	--no_copyÚcopyzhDo not use copy mechanism in MWT expansion. By default copy mechanism is used to improve generalization.z--augment_aposg{®Gáz„?u;   At training time, how much to augment |'| to |"| |â€™| |Ê¼|)r   r   r   z--force_exact_pieceszƒIf possible, make the text of the pieces of the MWT add up to the token itself.  (By default, this is determined from the dataset.))r   r   r   z--no_force_exact_piecesÚforce_exact_piecesz|Don't make the text of the pieces of the MWT add up to the token itself.  (By default, this is determined from the dataset.)z--sample_traing      ð?zSubsample training data.z--optimÚadamzsgd, adagrad, adam or adamax.z--lrgü©ñÒMbP?zLearning ratez
--lr_decaygÍÌÌÌÌÌì?z--decay_epoché   z&Decay the lr starting from this epoch.z--num_epochz--batch_sizez--max_grad_normg      @zGradient clipping.z
--log_stepé   zPrint log every k steps.z
--save_dirzsaved_models/mwtz--save_namezFile name to save the modelz--save_each_namez@Save each model in sequence to this pattern.  Mostly for testingz--seediÒ  z--wandbzStart a wandb session and write the results of training.  Only applies to training.  Use --wandb_name instead to specify a namez--wandb_namezWName of a wandb session to start when training.  Will default to the dataset short name)r   r   )ÚargparseÚArgumentParserÚadd_argumentÚstrÚintÚfloatr   Úadd_device_args)Úparser© r0   úU/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/mwt_expander.pyÚbuild_argparse'   sR   
r2   c                 C   s"   t ƒ }|j| d} | jrd| _| S )N©ÚargsT)r2   Ú
parse_argsÚ
wandb_nameÚwandb)r4   r/   r0   r0   r1   r5   Y   s
   r5   c                 C   sV   t | d} t | j¡ t| ƒ} t d | d ¡¡ | d dkr%t| ƒ d S t	| ƒ d S )Nr3   zRunning MWT expander in {} modeÚmoder   )
r5   r   Úset_random_seedÚseedÚvarsÚloggerÚinfoÚformatr   Úevaluater3   r0   r0   r1   Úmainb   s   
r@   c           &      C   s^  t  d| d  ¡ t  d | d ¡¡ tj| d d}t|| d | dd}|j}|j| d	< tj| d
 d}t|| d | |dd}t 	| d ¡ | d rO| d nd | d ¡}t
j | d |¡}d }| d rut
j | d | d ¡}t |¡}| d }	| d }
t|ƒdkrŠt  d¡ d S | d¡}t|ƒdkr¤|  dd¡r¤t  d¡ d| d< | d r°t|ƒs°tdƒ‚| d d u rÃt|ƒrÃt  d¡ d| d< | d rét  d¡ t|| d | dd}|j}|j| d	< t|| d | |dd}| d d u rü| d rød| d< nd| d< t| || d d }t  d!¡ | |jjdd¡ t  d"¡ | |jjdd¡}t |j¡}|j|dd# t ||	¡ t |	|
¡\}}}t  d$ |d% ¡¡ |  dd¡rW|  |¡ d S t  d&¡ d}t! "t|ƒ| d  ¡}|| d'  }g }g }| d( }t# #¡ }d)}| d* r¬dd l$}| d+ r| d+ nd,| d  }|j%|| d- |j&j'd.d/d0 |j&j'd1d2d0 t(d| d' d ƒD ]
}d}t)| *¡ ƒD ]=\}}t# #¡ }|d7 }|j+|dd3} || 7 }|| d4  dkrüt# #¡ | }!t  | t, -¡  .d5¡|||| d' | |!|¡¡ qÀ|r|  || ¡ t  d6||| f ¡ t  d"¡ g }t)| *¡ ƒD ]\}}| /|¡}"||"7 }q |  d7d¡rM|  d8d¡rMt  d9¡ | 0|jjdd|¡}t |j¡}|j|dd# t ||	¡ t |	|
¡\}}}#||j1 | d  }t  d: |||#¡¡ | d* r‰| 2||#d;œ¡ |dks•|#t3|ƒkr¡|  |¡ t  d<¡ |}|| d= krº|#|d> krº|| d? 9 }| 4|¡ ||#g7 }qµt  d@ |¡¡ | d* rÒ| 5¡  t3|ƒd% t6 7|¡d }$}%t  dA |$|%¡¡ |  d7d¡r-t  d9¡ | 0|jjdd|¡}t |j¡}|j|dd# t ||	¡ t |	|
¡\}}}#t  dB |#d% ¡¡ t3|$|#ƒ}$d S d S )CNúmax_dec_len: %dÚmax_dec_lenú"Loading data with batch size {}...Ú
batch_sizeÚ
train_file©Ú
input_fileF©Ú
evaluationÚ
vocab_sizeÚ	eval_fileT©ÚvocabrI   Úsave_dirÚ	save_nameú{}_mwt_expander.ptÚ	shorthandÚsave_each_nameÚoutput_fileÚ	gold_filer   z*Skip training because no data available...Ú	dict_onlyzRTraining data available, but dev data has no MWTs.  Only training a dict based MWTr$   ztCannot train model with --force_exact_pieces, as the MWT in this dataset are not entirely composed of their subwordszoTrain MWTs entirely composed of their subwords.  Training the MWT to match that paradigm as closely as possiblez Reconverting to BinaryDataLoaderÚ
num_layersé   r   Údevice)r4   rM   rX   z)Training dictionary-based MWT expander...zEvaluating on dev set...©Úfake_dependencieszDev F1 = {:.2f}r   z&Training seq2seq-based MWT expander...Ú	num_epochÚlrzJ{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}r7   r6   z%s_mwt)ÚnameÚconfigÚ
train_lossÚmin)ÚsummaryÚ	dev_scoreÚmax)ÚevalÚlog_stepz%Y-%m-%d %H:%M:%SzSaved epoch %d model to %sr   Úensemble_early_stopz'[Ensembling dict with seq2seq model...]z1epoch {}: train_loss = {:.6f}, dev_score = {:.4f})r_   rb   znew best model saved.Údecay_epochéÿÿÿÿÚlr_decayzTraining ended with {} epochs.z#Best dev F1 = {:.2f}, at epoch = {}zEnsemble dev F1 = {:.2f})8r<   Údebugr>   r   Ú	conll2docr   rM   Úsizer   Ú
ensure_dirÚosÚpathÚjoinÚbuild_save_each_filenameÚlenÚwarningÚget_mwt_expansionsÚgetr   Ú
ValueErrorr=   r   r	   Ú
train_dictÚdocÚpredict_dictr#   ÚdeepcopyÚset_mwt_expansionsÚwrite_doc2conllr
   ÚscoreÚsaveÚmathÚceilÚtimer7   ÚinitÚrunÚdefine_metricÚrangeÚ	enumerateÚ	to_loaderÚupdater   ÚnowÚstrftimer   ÚensembleÚnum_examplesÚlogrc   Ú	change_lrÚfinishÚnpÚargmax)&r4   Ú	train_docÚtrain_batchrM   Údev_docÚ	dev_batchrO   Ú
model_filerR   Úsystem_pred_filerT   Údev_mwtÚtrainerÚ	dev_predsrx   Ú_Údev_fÚglobal_stepÚsteps_per_epochÚ	max_stepsÚdev_score_historyÚbest_dev_predsÚ
current_lrÚglobal_start_timeÚ
format_strr7   r6   Úepochr_   ÚiÚbatchÚ
start_timeÚlossÚdurationÚpredsrb   Úbest_fÚ
best_epochr0   r0   r1   r   o   sò   












ÿ€








ør   c                 C   sè  | d }| d }| d r| d nd  | d ¡}| d r1| | d ¡s1tj |¡s1tj | d |¡}t|| d d}|j|j}}| D ]}| 	d	¡sR| 	d
¡sR|dv rX| | ||< qBt
 d|d  ¡ t
 d  | d ¡¡ tj| d d}t|| d ||dd}	t|	ƒdkrÁ| |	jjdd¡}
|d r–|
}n-t
 d¡ g }t|	 ¡ ƒD ]\}}|| |¡7 }q£| dd¡rÀ| |	jjdd|¡}ng }t |	j¡}|j|dd t ||¡ |d uròt ||¡\}}}t
 d  | d |d ¡¡ d S d S )NrS   rT   rO   rP   rQ   rN   rX   )r–   rX   Ú_dirÚ_file)rQ   rA   rB   rC   rD   rK   rF   TrL   r   rH   rU   zRunning the seq2seq model...r   FrY   zMWT expansion score: {} {:.2f}r   )r>   Ú
startswithrn   ro   Úexistsrp   r	   r4   rM   Úendswithr<   rj   r   rk   r   rr   ry   rx   rt   r=   r†   r‡   r   ru   r‹   r#   rz   r{   r|   r
   r}   )r4   r—   rT   r–   r™   Úloaded_argsrM   Úkrx   r§   Ú
dict_predsr«   r¦   Úbr›   r}   r0   r0   r1   r?     sD   "€
€ýr?   Ú__main__)N)0Ú__doc__Úsysrn   Úshutilr   r   r(   Úloggingr   Únumpyr   ÚrandomÚtorchr   r   r#   Ústanza.models.mwt.datar   r   Ústanza.models.mwt.utilsr   Ústanza.models.mwt.vocabr   Ústanza.models.mwt.trainerr	   Ústanza.models.mwtr
   Ústanza.models.commonr   Ú%stanza.models.common.seq2seq_constantÚmodelsÚcommonÚseq2seq_constantÚconstantÚstanza.models.common.docr   Ústanza.utils.conllr   Ústanza.modelsr   Ú	getLoggerr<   r2   r5   r@   r   r?   Ú__name__r0   r0   r0   r1   Ú<module>   sD    

2
	 !2
ÿ