o
    –h£7  ã                   @   s<  d Z ddlZddlZddlZddlZddlZddlmZ ddlZddlZ	ddl
Z
ddlZddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ ddlm  m  mZ dd
lT ddlm Z  ddl!m"Z" e #d¡Z$dd„ Z%ddd„Z&ddd„Z'dd„ Z(dd„ Z)dd„ Z*dd„ Z+e,dkrœe'ƒ  dS dS )a,  
Entry point for training and evaluating a lemmatizer.

This lemmatizer combines a neural sequence-to-sequence architecture with an `edit` classifier 
and two dictionaries to produce robust lemmas from word forms.
For details please refer to paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf.
é    N)Údatetime)ÚnnÚoptim)Ú
DataLoader)ÚVocab)ÚTrainer)ÚscorerÚedit)Úutils)Ú*)ÚCoNLL)Ú_training_loggingÚstanzac                  C   s   t  ¡ } | jdtddd | jdtd dd | jdtd dd | jdtd d	d | jd
td d	d | jddddgd | jdtdd | jddddd | jdddd | jdtdd | jdtdd | jd td!d | jd"td#d | jd$td#d | jd%tdd | jd&td!d | jd'd(g d)¢d*d+ | jd,tdd | jd-td#d | jd.d/dd0d | jd1tttjƒd | jd2td3d | jd4d5dd6d | jd7d8dd9d | jd:dd;d | jd<td d=d | jd>td d?d | jd@td dAd | jdBtd3dCd | jdDtdEdFd | jdGtdHdId | jdJtdKd | jdLtdMdNd | jdOtdPd | jdQtdd | jdRtdSdTd | jdUtdVdWd | jdXtdYdZd | jd[td\d]d | jd^d_dd`da | jdbtdcd t	 
| ¡ | jdddded | jdfd dgdh | S )iNz
--data_dirz
data/lemmazDirectory for all lemma data.)ÚtypeÚdefaultÚhelpz--train_filezInput file for data loader.z--eval_filez--output_filezOutput CoNLL-U file.z--gold_filez--modeÚtrainÚpredict)r   Úchoicesz--shorthandz/Shorthand for the dataset to use.  lang_dataset)r   r   z	--no_dictÚensemble_dictÚstore_falsezADo not ensemble dictionary with seq2seq. By default use ensemble.)ÚdestÚactionr   z--dict_onlyÚ
store_truez)Only train a dictionary-based lemmatizer.)r   r   z--hidden_diméÈ   )r   r   z	--emb_dimé2   z--num_layersé   z--emb_dropoutg      à?z	--dropoutz--max_dec_lenz--beam_sizez--attn_typeÚsoft)r   ÚmlpÚlinearÚdeepzAttention type)r   r   r   z	--pos_dimz--pos_dropoutz	--no_editr	   zODo not use edit classifier in lemmatization. By default use an edit classifier.z
--num_editz--alphag      ð?z--no_posÚposz:Do not use UPOS in lemmatization. By default UPOS is used.z	--no_copyÚcopyzhDo not use copy mechanism in lemmatization. By default copy mechanism is used to improve generalization.z--charlmzVTurn on contextualized char embedding using pretrained character-level language model.z--charlm_shorthandz=Shorthand for character-level language model training corpus.z--charlm_forward_filez$Exact path to use for forward charlmz--charlm_backward_filez%Exact path to use for backward charlmz--sample_trainzSubsample training data.z--optimÚadamzsgd, adagrad, adam or adamax.z--lrgü©ñÒMbP?zLearning ratez
--lr_decaygÍÌÌÌÌÌì?z--decay_epoché   z&Decay the lr starting from this epoch.z--num_epoché<   z--batch_sizez--max_grad_normg      @zGradient clipping.z
--log_stepé   zPrint log every k steps.z
--save_dirzsaved_models/lemmazRoot dir for saving models.z--save_namez%{shorthand}_{embedding}_lemmatizer.ptzFile name to save the modelz
--caselessFznLowercase everything first before processing.  This will happen automatically if 100%% of the data is caseless)r   r   r   z--seediÒ  z--wandbzStart a wandb session and write the results of training.  Only applies to training.  Use --wandb_name instead to specify a namez--wandb_namezWName of a wandb session to start when training.  Will default to the dataset short name)r   r   )ÚargparseÚArgumentParserÚadd_argumentÚstrÚintÚfloatÚlenr	   Ú
EDIT_TO_IDr
   Úadd_device_args)Úparser© r1   úS/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/lemmatizer.pyÚbuild_argparse!   s\   
r3   c                 C   sP   t ƒ }|j| d} | jrd| _t| ƒ} | d r | d  d¡d nd}|| d< | S )N©ÚargsTÚ	shorthandÚ_r   Ú Úlang)r3   Ú
parse_argsÚ
wandb_nameÚwandbÚvarsÚsplit)r5   r0   r9   r1   r1   r2   r:   Z   s   r:   c                 C   sP   t | d} t | d ¡ t d | d ¡¡ | d dkr"t| ƒ d S t| ƒ d S )Nr4   ÚseedzRunning lemmatizer in {} modeÚmoder   )r:   r
   Úset_random_seedÚloggerÚinfoÚformatr   Úevaluater4   r1   r1   r2   Úmaing   s   
rF   c                 C   s4   | j D ]}|jD ]}|j ¡ |jkr  dS qqdS )NFT)Ú	sentencesÚwordsÚtextÚlower)ÚdocÚsentenceÚwordr1   r1   r2   Úall_lowercases   s   

ÿÿrN   c                 C   sb   d}| d r| d rd}| d j | d |d}tj |¡d }| | d ¡s/tj | d |¡}|S )	NÚnocharlmÚcharlmÚcharlm_forward_fileÚ	save_namer6   )r6   Ú	embeddingr   Úsave_dir)rD   ÚosÚpathr>   Ú
startswithÚjoin)r5   rS   Ú
model_fileÚ	model_dirr1   r1   r2   Úbuild_model_filenamez   s   ÿr[   c           #      C   sØ  t  d | d ¡¡ tj| d d}t|| d | dd}|j}|d j| d< |d	 j| d
< tj| d d}t|| d | |dd}t 	| d ¡ t
| ƒ}t  d|¡ | d }| d }t | ¡ t|ƒdksjt|ƒdkrqt  d¡ d S | d s‚t|ƒr‚t  d¡ d| d< t  d|¡ t| || d d}	t  d¡ |	 | ¡ ¡ t  d¡ |	 |j ttg¡¡}
|j tg|
¡ t |j|¡ t ||¡\}}}t  d |d ¡¡ |  dd¡rÜ|	 |¡ d S | d r
dd l}| d rí| d nd | d!  }|j|| d" |jj d#d$d% |jj d&d'd% t  d(¡ d}t|ƒ| d)  }g }g }| d* }t! !¡ }d+}t"d,| d) d, ƒD ]}d}t#|ƒD ]=\}}t! !¡ }|d,7 }|	j$|dd-}||7 }|| d.  dkrut! !¡ | }t  | t% &¡  'd/¡|||| d) |||¡¡ q9t  d¡ g }
g }t#|ƒD ]\}}|	 (|| d0 ¡\}}|
|7 }
|d urŸ||7 }q„|	j)|j tg¡|
|d1}
|  d2d¡rÆt  d3¡ |	 *|j ttg¡|
¡}
|j tg|
¡ t |j|¡ t ||¡\}}} ||j+ | d  }t  d4 ||| ¡¡ | d rþ| ,|| d5œ¡ |d,ks
| t-|ƒkr|	 |¡ t  d6¡ |
}|| d7 kr6| |d8 kr6| d9 d:v r6|| d; 9 }|	 .|¡ || g7 }t  d<¡ q0t  d= |¡¡ | d rS| /¡  t-|ƒd t0 1|¡d, }!}"t  d> |!|"¡¡ d S )?Nz$[Loading data with batch size {}...]Ú
batch_sizeÚ
train_file©Ú
input_fileF)Ú
evaluationÚcharÚ
vocab_sizer!   Úpos_vocab_sizeÚ	eval_fileT©Úvocabr`   rT   zUsing full savename: %sÚoutput_fileÚ	gold_filer   z5[Skip training because no training data available...]ÚcaselesszBBuilding a caseless model, as all of the training data is caselesszBuilding lemmatizer in %sÚdevice)r5   rf   rj   z)[Training dictionary-based lemmatizer...]zEvaluating on dev set...zDev F1 = {:.2f}éd   Ú	dict_onlyr<   r;   z%s_lemmatizerr6   )ÚnameÚconfigÚ
train_lossÚmin)ÚsummaryÚ	dev_scoreÚmaxz&[Training seq2seq-based lemmatizer...]Ú	num_epochÚlrzJ{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}r   )ÚevalÚlog_stepz%Y-%m-%d %H:%M:%SÚ	beam_size©Úeditsr   z'[Ensembling dict with seq2seq model...]z1epoch {}: train_loss = {:.6f}, dev_score = {:.4f})ro   rr   znew best model saved.Údecay_epochéÿÿÿÿr   )ÚsgdÚadagradÚlr_decayr8   zTraining ended with {} epochs.z#Best dev F1 = {:.2f}, at epoch = {})2rB   rC   rD   r   Ú	conll2docr   rf   Úsizer
   Ú
ensure_dirr[   Úprint_configr-   ÚwarningrN   r   Ú
train_dictÚraw_dataÚpredict_dictrK   ÚgetÚTEXTÚUPOSÚsetÚLEMMAÚwrite_doc2conllr   ÚscoreÚsaver<   ÚinitÚrunÚdefine_metricÚtimeÚrangeÚ	enumerateÚupdater   ÚnowÚstrftimer   ÚpostprocessÚensembleÚnum_examplesÚlogrs   Ú	update_lrÚfinishÚnpÚargmax)#r5   Ú	train_docÚtrain_batchrf   Údev_docÚ	dev_batchrY   Úsystem_pred_filerh   ÚtrainerÚ	dev_predsr7   Údev_fr<   r;   Úglobal_stepÚ	max_stepsÚdev_score_historyÚbest_dev_predsÚ
current_lrÚglobal_start_timeÚ
format_strÚepochro   ÚiÚbatchÚ
start_timeÚlossÚdurationÚ	dev_editsÚpredsrz   rr   Úbest_fÚ
best_epochr1   r1   r2   r   …   s¼   






ÿ€

€






r   c                 C   sø  | d }| d }t | ƒ}t|| d | d}|j|j}}| D ]}| d¡s.| d¡s.|dv r4| | ||< qt d | d	 ¡¡ tj	| d
 d}t
|| d	 ||dd}	t|	ƒdkrdt d | d ¡¡ d S | |	j ttg¡¡}
| dd¡rx|
}nWt d¡ g }g }t|	ƒD ]\}}| || d ¡\}}||7 }|d urŸ||7 }q…|j|	j tg¡||d}| dd¡rÄt d¡ | |	j ttg¡|¡}| ¡ rÏ| |	j|¡}|	j tg|¡ t |	j|¡ |d urút ||¡\}}}t d | d |d ¡¡ d S d S )Nrg   rh   rj   )rY   rj   r5   Ú_dirÚ_file)r6   z"Loading data with batch size {}...r\   rd   r^   Tre   r   zDSkip evaluation because no dev data is available...
Lemma score:
{} r6   rl   FzRunning the seq2seq model...rx   ry   r   z,[Ensembling dict with seq2seq lemmatizer...]z*Finished evaluation
Lemma score:
{} {:.2f}rk   )r[   r   r5   rf   ÚendswithrB   rC   rD   r   r€   r   r-   r„   r‡   rK   rˆ   r‰   rŠ   r•   r   r™   rš   Úhas_contextual_lemmatizersÚupdate_contextual_predsr‹   rŒ   r   r   rŽ   )r5   r¥   rh   rY   r¦   Úloaded_argsrf   ÚkrK   r²   Ú
dict_predsr·   rz   r±   ÚbÚpsÚesr7   rŽ   r1   r1   r2   rE     sN   €
€
ýrE   Ú__main__)N)-Ú__doc__ÚloggingÚsysrU   Úshutilr“   r   r'   ÚnumpyrŸ   ÚrandomÚtorchr   r   Ústanza.models.lemma.datar   Ústanza.models.lemma.vocabr   Ústanza.models.lemma.trainerr   Ústanza.models.lemmar   r	   Ústanza.models.commonr
   Ú%stanza.models.common.seq2seq_constantÚmodelsÚcommonÚseq2seq_constantÚconstantÚstanza.models.common.docÚstanza.utils.conllr   Ústanza.modelsr   Ú	getLoggerrB   r3   r:   rF   rN   r[   r   rE   Ú__name__r1   r1   r1   r2   Ú<module>   s@    

9
}6
ÿ