"""
Entry point for training and evaluating a neural tokenizer.

This tokenizer treats tokenization and sentence segmentation as a tagging problem, and uses a combination of
recurrent and convolutional architectures.
For details, please refer to the paper: https://nlp.stanford.edu/pubs/qi2018universal.pdf.
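
A typical training run can be launched from the command line roughly as follows. The flags are the options
defined in build_argparse() below; the file paths and the vi_vtb treebank shorthand are placeholders only:

    python -m stanza.models.tokenizer --mode train --lang vi --shorthand vi_vtb \
        --txt_file data/tokenize/vi_vtb.train.txt --label_file data/tokenize/vi_vtb-ud-train.toklabels \
        --dev_txt_file data/tokenize/vi_vtb.dev.txt --dev_label_file data/tokenize/vi_vtb-ud-dev.toklabels \
        --dev_conll_gold data/tokenize/vi_vtb.dev.gold.conllu --conll_file /tmp/vi_vtb.dev.pred.conllu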

Updated: this version of the tokenizer model incorporates a dictionary feature, which is especially useful for
languages with multi-syllable words such as Vietnamese, Chinese or Thai. In summary, a lexicon containing all
unique words found in the training dataset and in an external lexicon (if any) is created during training and
saved alongside the model. From this lexicon, a dictionary is built which contains "words", "prefixes" and
"suffixes" sets. During data preparation, dictionary features are extracted at each character position by
"looking ahead" and "looking backward" to check whether any of the strings formed are found in the dictionary.
The window size (the dictionary feature length) is set at the 95th percentile of word lengths in the lexicon,
which discards the infrequent but very long words (and avoids a high-dimensional feature vector). The prefix
and suffix sets are used to stop early during the window-dictionary checking process, as sketched below.
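
For illustration, the per-position dictionary lookup works roughly as follows. This is a minimal,
self-contained sketch of the idea rather than the exact code in stanza.models.tokenization; the
"words"/"prefixes"/"suffixes" keys follow the description above, and `window` stands for the dictionary
feature length:

    def dictionary_features(chars, i, dictionary, window):
        # One indicator per candidate length: does a dictionary word start at position i
        # (look ahead) or end at position i (look backward)?
        forward = [0] * window
        backward = [0] * window
        for length in range(1, window + 1):
            span = ''.join(chars[i:i + length])                   # look ahead
            if len(span) < length:
                break                                             # ran past the end of the text
            if span in dictionary['words']:
                forward[length - 1] = 1
            elif span not in dictionary['prefixes']:
                break                                             # no longer dictionary word can start here
        for length in range(1, window + 1):
            span = ''.join(chars[max(0, i - length + 1):i + 1])   # look backward
            if len(span) < length:
                break                                             # ran past the start of the text
            if span in dictionary['words']:
                backward[length - 1] = 1
            elif span not in dictionary['suffixes']:
                break                                             # no longer dictionary word can end here
        return forward + backward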
    N)copy)utils)Trainer)
DataLoaderTokenizationDataset)load_mwt_dict
eval_modeloutput_predictionsload_lexiconcreate_dictionary)_training_loggingstanzac                  C   sx  t  } | jdtdd | jdtddd | jdtdd	d | jd
tddd | jdtdd | jdtddd | jdtddd | jdtdd | jdtdd | jddddgd | jdddd | jdtdd d | jd!td"d#d | jd$td%d&d | jd'd(d)d*d+ | jd,d-d)d.d+ | jd/td0d1d | jd2dd3d | jd4tdd5d | jd6td7d8d | jd9dd:d | jd;td<d=d | jd>td?d@d | jdAtdBdCd | jdDtdEdFd | jdGtdHdId | jdJtdHdKd | jdLtdMdNd | jdOtdHdPd | jdQtdRdSd | jdTtdUdVd | jdWtdUdXd | jdYtdZd[d | jd\td]d^d | jd_tdd`d | jdatdbdcd | jddtdedfd | jdgtdhdid | jdjtd]dkd | jdltdmdnd | jdotdpdqd | jdrtddsd | jdttddud | jdvtdwdxd t|  | jdytdzd{ | jd|d}ddd~d | jdd}d)dd+ | jdddd | jdddd | S )z4
    If args == None, the system args are used.
    z
--txt_filezInput plaintext file)typehelpz--label_fileNzCharacter-level label file)r   defaultr   z--mwt_json_filezJSON file for MWT expansionsz--conll_filezCoNLL file for outputz--dev_txt_filez1(Train only) Input plaintext file for the dev setz--dev_label_filez7(Train only) Character-level label file for the dev setz--dev_conll_goldz<(Train only) CoNLL-U file for the dev set for early stoppingz--langLanguagez--shorthandzUD treebank shorthandz--modetrainpredict)r   choicesz--skip_newline
store_truez\Whether to skip newline characters in input. Particularly useful for languages like Chinese.)actionr   z	--emb_dim    zDimension of unit embeddingsz--hidden_dim@   zDimension of hidden unitsz--conv_filtersz1,9zbConfiguration of conv filters. ,, separates layers and , separates filter sizes in the same layer.z--no-residualresidualstore_falsezAdd linear residual connections)destr   r   z--no-hierarchicalhierarchicalz"Hierarchical" RNN tokenizerz--hier_invtemp      ?zSInverse temperature used in propagating tokenization predictions between RNN layersz--input_dropoutz Dropout input embeddings as wellz
--conv_resz)Convolutional residual layers for the RNNz--rnn_layers   zLayers of RNN in the tokenizerz--use_dictionarya(  Use dictionary feature. The lexicon is created using the training data and external dict (if any) expected to be found under the same folder of training dataset, formatted as SHORTHAND-externaldict.txt where each line in this file is a word. For example, data/tokenize/zh_gsdsimp-externaldict.txtz--max_grad_normg      ?z Maximum gradient norm to clip toz--annealg+?zHAnneal the learning rate by this amount when dev performance deterioratez--anneal_afteri  z2Anneal the learning rate no earlier than this stepz--lr0gMb`?zInitial learning ratez	--dropoutgQ?zDropout probabilityz--unit_dropoutzUnit dropout probabilityz--feat_dropoutg?z?Features dropout probability for each element in feature vectorz--feat_unit_dropoutz.The whole feature of units dropout probabilityz--tok_noiseg{Gz?z:Probability to induce noise to the input of the higher RNNz--sent_drop_probg?z|Probability to drop sentences at the end of batches during training uniformly at random.  Idea is to fake paragraph endings.z--last_char_drop_probzProbability to drop the last char of a block of text during training, uniformly at random.  Idea is to fake a document ending w/o sentence final punctuation, hopefully to avoid the tokenizer learning to always tokenize the last character as a periodz--weight_decayg        zWeight decayz--max_seqlend   z-Maximum sequence length to consider at a timez--batch_sizezBatch size to usez--epochs
   z#Total epochs to train the model forz--stepsiP  z7Steps to train the model for, if unspecified use epochsz--report_steps   z#Update step interval to report lossz--shuffle_stepsz8Step interval to shuffle each paragraph in the generatorz--eval_steps   zEStep interval to evaluate the model on the dev set for early stoppingz--max_steps_before_stopi  zJEarly terminates after this many steps if the dev scores are not improvingz--save_namezFile name to save the modelz--load_namezFile name to load a saved modelz
--save_dirzsaved_models/tokenizezDirectory to save models inz--seedi  )r   r   z	--use_mwtuse_mwtz}Whether or not to include mwt output layers.  If set to None, this will be determined by examining the training data for MWTs)r   r   r   r   z--no_use_mwtz+Whether or not to include mwt output layersz--wandbzStart a wandb session and write the results of training.  Only applies to training.  Use --wandb_name instead to specify a namez--wandb_namezWName of a wandb session to start when training.  Will default to the dataset short name)r   r   )argparseArgumentParseradd_argumentstrintfloatr   add_device_args)parser r,   R/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/tokenizer.pybuild_argparse"   sh   
r.   c                 C   s*   t  }|j| d} | jrd| _t| } | S )NargsT)r.   
parse_args
wandb_namewandbvars)r0   r+   r,   r,   r-   r1   `   s   r1   c                 C   s^   | d d ur| d }n| d d }t jt j| d |s&t j|r&|S t j| d |S )N	save_name	shorthandz_tokenizer.ptsave_dir)ospathexistsjoin)r0   r5   r,   r,   r-   model_file_namej   s   
&r<   c                 C   s   t | d} t| d  td| d  g d| d< t| d | d< t| | d< tt	j
| d d	  | d d
krCt|  d S t|  d S )Nr/   seedzRunning tokenizer in {} modemode)space_beforecapitalizednumericend_of_parastart_of_para
feat_funcsfeat_dimr5   r   r   )r1   r   set_random_seedloggerinfoformatlenr<   
ensure_dirr8   r9   splitr   evaluater/   r,   r,   r-   maint   s   
rN   c                 C   s  | d rt | \}| d< t|}| d  | d d 7  < nd| d< d }d }t| d }| d | d d	}t| ||d
}|j}t|| d< | d | d d	}t| ||d|d}| d d u rs| | d< t	d
| d rldnd| d  t| |||| d d}	| d d urtj| d | d }
|	|
 |	| d  t|}| d d ur| d nt|| d  | d  d }| d }d}d}d}| d rdd l}| d r| d nd | d!  }|j|| d" |jjd#d$d% |jjd&d'd% td(|d( D ]}|j| d) | d* d+}|	|}|| d,  dkr(t	d-
||| | d r(|jd#|i|d. | d/ dkr<|| d/  dkr<|  || d0  dkrt| |	|||}| d r[|jd&|i|d. d1
|d2 g}|| d3 kr||k r|d4
||| d5  g7 }|| d5 9 }|	| |}||kr|d6g7 }|}|}|	| d7  n"|dkr|| | d8 kr|d9
|| g7 }t	d:|  n	t	d:| q| d r|  |dkrt	d;
|| d S t	d< |	| d7  d S )=Nuse_dictionarynum_dict_featrE      r   mwt_json_filetxt_file
label_filetxtlabel)input_files
dictionary
vocab_sizedev_txt_filedev_label_fileTrX   vocab
evaluationrY   r#   z9Found {}mwts in the training data.  Setting use_mwt to {} zno device)r0   r^   lexiconrY   ra   	load_namer7   lr0stepsepochs
batch_sizer   r3   r2   z%s_tokenizerr6   )nameconfig
train_lossmin)summary	dev_scoremaxr   unit_dropoutfeat_unit_dropout)rp   rq   report_stepszStep {:6d}/{:6d} Loss: {:.3f})stepshuffle_steps
eval_stepszDev score: {:6.3f}r   anneal_afterzlr: {:.6f} -> {:.6f}annealzNew best dev score!r5   max_steps_before_stopz4Stopping training after {} steps with no improvement	zBest dev score={} at step {}z,Dev set never evaluated.  Saving final model)r
   r   r   r   r^   rJ   r   has_mwtrG   rH   rI   r   r8   r9   r;   load	change_lrr(   r3   initrundefine_metricrangenextupdatelogshuffler   savefinish)r0   rb   rY   mwt_dicttrain_input_filestrain_batchesr^   dev_input_filesdev_batchestrainerrc   Nre   lrprev_dev_scorebest_dev_scorebest_dev_stepr3   r2   rs   batchlossrn   reportsr,   r,   r-   r      s   "
0

 






r   c                 C   s   t | d }t| d p| d | d d}|j|j}}|D ]}|ds.|dvr.|| | |< q| d | d	 d
}t| ||d|jd}t| d ||||| d \}}	}
}
t	d
||	 d ||	 d S )NrR   rc   r5   ra   )
model_filera   _file)ra   r>   r7   rc   r5   rS   rT   rU   Tr]   
conll_file
max_seqlenz OOV rate: {:6.3f}% ({:6d}/{:6d})r   )r   r   r0   r^   endswithr   rY   r	   rG   rH   rI   )r0   r   r   loaded_argsr^   keval_input_filesbatches	oov_countr   _r,   r,   r-   rM      s   " rM   __main__)N)"__doc__r$   r   loggingrandomnumpynpr8   torchjsonstanza.models.commonr   "stanza.models.tokenization.trainerr   stanza.models.tokenization.datar   r    stanza.models.tokenization.utilsr   r   r	   r
   r   stanza.modelsr   	getLoggerrG   r.   r1   r<   rN   r   rM   __name__r,   r,   r,   r-   <module>   s0    

>


f
