import argparse
import ast
import logging
import os
import random
import re
from enum import Enum

import torch
import torch.nn as nn

from stanza.models.common import loss
from stanza.models.common import utils
from stanza.models.pos.vocab import CharVocab
import stanza.models.classifiers.data as data
from stanza.models.classifiers.trainer import Trainer
from stanza.models.classifiers.utils import WVType, ExtraVectors, ModelType
from stanza.models.common.peft_config import add_peft_args, resolve_peft_args
from stanza.utils.confusion import format_confusion, confusion_to_accuracy, confusion_to_macro_f1

class Loss(Enum):
    CROSS = 1
    WEIGHTED_CROSS = 2
    LOG_CROSS = 3
    FOCAL = 4

class DevScoring(Enum):
    ACCURACY = 'ACC'
    WEIGHTED_F1 = 'WF'

logger = logging.getLogger('stanza')
tlogger = logging.getLogger('stanza.classifiers.trainer')

logging.getLogger('elmoformanylangs').setLevel(logging.WARNING)

DEFAULT_TRAIN = 'data/sentiment/en_sstplus.train.txt'
DEFAULT_DEV = 'data/sentiment/en_sst3roots.dev.txt'
DEFAULT_TEST = 'data/sentiment/en_sst3roots.test.txt'

def convert_fc_shapes(arg):
    """
    Returns a tuple of sizes to use in FC layers.

    For example, converts "100" -> (100,)
    "100,200" -> (100,200)
    r   )stripastliteral_eval
isinstanceinttuple)argr   r   r   convert_fc_shapesh   s   


r+   g-C6*?g      ?gMbP?g-C6
?gMb@?)adamwadadeltasgd	adabeliefmadgradr.   g-q=gư>g:0yE>)r/   r-   r,   g?)r0   r.   g?-C6?{Gz?gv!>g>)r,   r-   r.   r/   r0   c               	   C   s  t  } | jdddddd | jdddd	d
 | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jddddd
 | jddddd  | jd!ttd"d | jd#ttd$d | jd%ttd&d | jd'ddd(d  | jd)td*d+ | jd,td-d+ | jd.d/d0 tj	d1d2
d3d4 tD  d | jd5d6tjd7d8 | jd9d:tjd;d8 | jd<d=td>d8 | jd?d@tdAd8 | jdBd-tdCd8 | jdDdEtdFd8 | jdGdHtdId8 | jdJdKd0 tjdLd2
dMd4 tD  d | jdNdtdOd8 | jdPdtdQd8 | jdRdtdSd8 | jdTdUg dVdWdX | jdYdtjdZd8 | jd[d\ddd]d | jd^d\dd_d
 | jd`dad0 tjdbd | jdcddtded8 | jdftdgdhd | jditdjd+ | jdktddld | jdmtddnd | jdotdpdqd | jdrdsd0 dtdud | jdvtdgdwd | jdxdyd0 dzd{d | jd|tdd}d | jd~tddd | jdtddd | jdtddd | jddddd
 | jdddd | jdddddd | jdtddd | jdtddd | jdddddd | jddddd  | jddtdd8 | jddtdd8 | jdtddd | jdddddd | jdddddd | jddddd
 | jdtddd | jdtddd | jdddddd | jdtddd | jddddd  | jddddd  | jddddd
 | jdddddd | jddddd
 | jddddd  | jdddddd | jddddd  | jddddˍ | jdddd | jddtdd8 t|  t|  | S )z
    Build the argparse for the classifier.

    Refactored so that other utility scripts can use the same parser if needed.
    z--traintrainT
store_truezTrain the model (default))destdefaultactionhelpz
--no_trainstore_falsezDon't train the model)r5   r7   r8   z--shorthanden_ewtz'Treebank shorthand, eg 'en' for English)typer6   r8   z--load_nameNz"Name for loading an existing modelz
--save_dirzsaved_models/classifierzRoot dir for saving models.z--save_namezI{shorthand}_{embedding}_{bert_finetuning}_{classifier_type}_classifier.ptzName for saving the modelz--checkpoint_save_namez,File name to save the most recent checkpointz--no_checkpoint
checkpointzDon't save checkpointsz--save_intermediate_modelsFz1Save all intermediate models - this can be a lot!)r6   r7   r8   z--train_filez~Input file(s) to train a model from.  Each line is an example.  Should go <label> <tokenized sentence>.  Comma separated list.z
--dev_filez$Input file(s) to use as the dev set.z--test_filez%Input file(s) to use as the test set.z--output_predictionsz,Output predictions when running the test setz--max_epochsd   )r;   r6   z--tick2   z--model_typec                 S      t |   S N)r	   upperxr   r   r   <lambda>       z build_argparse.<locals>.<lambda>zModel type to use.  Options: %s c                 s       | ]}|j V  qd S r@   name.0rC   r   r   r   	<genexpr>       z!build_argparse.<locals>.<genexpr>z--filter_sizes)r   r      z1Filter sizes for the layer after the word vectors)r6   r;   r8   z--filter_channelsi  zNumber of channels for layers after the word vectors.  Int for same number of channels (scaled by width) for each filter, or tuple/list for exact lengths for each filterz--fc_shapesz400,100zExtra fully connected layers to put after the initial filters.  If set to blank, will FC directly from the max pooling to the output layer.z	--dropoutg      ?zDropout value to usez--batch_sizezBatch size when trainingz--batch_single_item   z(Items of this size go in their own batchz--dev_eval_batchesi  zURun the dev set after this many train batches.  Set to 0 to only do it once per epochz--dev_eval_scoringc                 S   r?   r@   )r   rA   rB   r   r   r   rD      rE   z?Scoring method to use for choosing the best model.  Options: %sc                 s   rG   r@   rH   rJ   r   r   r   rL      rM   z--weight_decayz1Weight decay (eg, l2 reg) to use in the optimizerz--learning_ratez%Learning rate to use in the optimizerz
--momentumz Momentum to use in the optimizerz--optimr-   )r-   r0   r.   z\Optimizer type: SGD, Adadelta, or madgrad.  Highly recommend to install madgrad and use that)r6   choicesr8   z--test_remap_labelszMap of which label each classifier label should map to.  For example, "{0:0, 1:0, 3:1, 4:1}" to map a 5 class sentiment test to a 2 class.  Any labels not mapped will be considered wrongz--forgive_unmapped_labelsforgive_unmapped_labelszrWhen remapping labels, such as from 5 class to 2 class, pick a different label if the first guess is not remapped.z--no_forgive_unmapped_labelszxWhen remapping labels, such as from 5 class to 2 class, DON'T pick a different label if the first guess is not remapped.z--lossc                 S   r?   r@   )r   rA   rB   r   r   r   rD      rE   zCWhether to use regular cross entropy or scale it by 1/log(quantity)z--loss_focal_gammar   zgamma value for a focal lossz--min_train_lenr   z&Filter sentences less than this lengthz--pretrain_max_vocabz--wordvec_pretrain_filez'Exact name of the pretrain file to readz--wordvec_raw_filez*Exact name of the raw wordvec file to readz--wordvec_dirzextern_data/wordveczDirectory of word vectorsz--wordvec_typec                 S   r?   r@   )r   rA   rB   r   r   r   rD      rE   word2vecz[Different vector types have different options, such as google 300d replacing numbers with #z--extra_wordvec_dimz+Extra dim of word vectors - will be trainedz--extra_wordvec_methodc                 S   r?   r@   )r   rA   rB   r   r   r   rD      rE   sumz8How to train extra dimensions of word vectors, if at allz--extra_wordvec_max_normz+Max norm for initializing the extra vectorsz--charlm_forward_filez$Exact path to use for forward charlmz--charlm_backward_filez%Exact path to use for backward charlmz--charlm_projectionz+Project the charlm values to this dimensionz--char_lowercasechar_lowercasez-Use lowercased characters in character model.z--elmo_modelzextern_data/manyelmo/englishzDirectory with elmo model)r6   r8   z
--use_elmouse_elmoz+Use an elmo model as a source of parametersz--elmo_projectionz$Project elmo to this many dimensionsz--bert_modelz>Use an external bert model (requires the transformers package)z--no_bert_model
bert_modelstore_constzDon't use bert)r5   r7   constr8   z--bert_finetunezFinetune the Bert modelz--bert_learning_rater2   z?Scale the learning rate for transformer finetuning by this muchz--bert_weight_decayr1   z>Scale the weight decay for transformer finetuning by this muchz--bert_hidden_layersr   z;How many layers of hidden state to use from the transformerz--bert_hidden_layers_originalbert_hidden_layersz&Use layers 2,3,4 of the Bert embedding)r7   rY   r5   r8   z--bilstmbilstmzUse a bilstm after the inputs, before the convs.  Using bilstm is about as accurate and significantly faster (because of dim reduction) than going straight to the filters)r5   r7   r6   r8   z--no_bilstmz6Don't use a bilstm after the inputs, before the convs.z--bilstm_hidden_dimi,  zDimension of the bilstm to usez--maxpool_widthr   z"Width of the maxpool kernel to usez--no_constituency_backpropconstituency_backpropzLWhen using a constituency parser, backprop into the parser's weights if Truez--constituency_modelz7/home/john/stanza_resources/it/constituency/vit_bert.ptzDWhich constituency model to use.  TODO: make this more user friendlyz--constituency_batch_normzJAdd a LayerNorm between the output of the parser and the classifier layersz--constituency_node_attnzZTrue means to make an attn layer out of the tree, with the words as key and nodes as queryz--no_constituency_node_attnconstituency_node_attnz--constituency_top_layerconstituency_top_layerzrTrue means use the top (ROOT) layer of the constituents.  Otherwise, the next layer down (S, usually) will be usedz--no_constituency_top_layerz--constituency_all_wordsz5Use all word positions in the constituency classifierz--no_constituency_all_wordsconstituency_all_wordszNUse the start and end word embeddings as inputs to the constituency classifierz--log_normsz=Log the parameters norms while training.  A very noisy optionz--wandbzStart a wandb session and write the results of training.  Only applies to training.  Use --wandb_name instead to specify a name)r7   r8   z--wandb_namezWName of a wandb session to start when training.  Will default to the dataset short namez--seedzRandom seed for model)argparseArgumentParseradd_argumentstrDEFAULT_TRAINDEFAULT_DEVDEFAULT_TESTr(   r	   CNNjoinr%   r&   r+   floatr   r!   r   r   r
   r   add_device_args)parserr   r   r   build_argparse   s   



rl   c                 C   sv   dd dd | jD  }|d| j  }| jr&|dd dd | jD   }tjt| d|| jjd	}t	
d
| |S )NzFS_%s_c                 S      g | ]}t |qS r   rc   rJ   r   r   r   
<listcomp>'      z(build_model_filename.<locals>.<listcomp>z_C_%d_z_FC_%s_c                 S   rn   r   ro   rJ   r   r   r   rp   *  rq   
classifier)shapeclassifier_typezExpanded save_name: %s)rh   filter_sizesfilter_channels	fc_shapesr   standard_model_file_namevars
model_typerI   loggerinfo)argsrs   model_save_filer   r   r   build_model_filename&  s   r   c                 C   s   t  }|| } t| t | jrd| _| j | _| jdu r&t	
| jd| _| jdu r3t
| jd| _| jdu r@t
| jd| _| S )zi
    Add arguments for building the classifier.
    Parses command line args and returns the result.
    TN)rl   
parse_argsr   tlogger
wandb_namewandboptimlowerweight_decayDEFAULT_WEIGHT_DECAYgetmomentumDEFAULT_MOMENTUMlearning_rateDEFAULT_LEARNING_RATES)r}   rk   r   r   r   r   0  s   




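# Illustrative note (assumes the DEFAULT_* tables above): parse_args fills in
# optimizer-specific hyperparameters when they are not given explicitly.
#
#   args = parse_args(["--optim", "madgrad"])
#   args.learning_rate   # -> 0.0005, from DEFAULT_LEARNING_RATES["madgrad"]
#   args.momentum        # -> 0.9,    from DEFAULT_MOMENTUM["madgrad"]
#   args.weight_decay    # -> 2e-06,  from DEFAULT_WEIGHT_DECAY["madgrad"]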
def dataset_predictions(model, dataset):
    model.eval()
    index_label_map = {x: y for (x, y) in enumerate(model.labels)}
    dataset_lengths = data.sort_dataset_by_len(dataset, keep_index=True)

    predictions = []
    o_idx = []
    for length in dataset_lengths.keys():
        batch = dataset_lengths[length]
        output = model([x[1] for x in batch])
        for i in range(len(batch)):
            predicted = torch.argmax(output[i])
            predicted_label = index_label_map[predicted.item()]
            predictions.append(predicted_label)
            o_idx.append(batch[i][0])
    predictions = utils.unsort(predictions, o_idx)
    return predictions

def confusion_dataset(predictions, dataset, labels):
    """
    Returns a confusion matrix

    First key: gold
    Second key: predicted
    so: confusion_matrix[gold][predicted]
    """
    confusion_matrix = {}
    for label in labels:
        confusion_matrix[label] = {}

    for prediction, datum in zip(predictions, dataset):
        expected_label = datum.sentiment
        confusion_matrix[expected_label][prediction] = confusion_matrix[expected_label].get(prediction, 0) + 1

    return confusion_matrix

def score_dataset(model, dataset, label_map=None, remap_labels=None, forgive_unmapped_labels=False):
    """
    remap_labels: a dict from old label to new label to use when
    testing a classifier on a dataset with a simpler label set.
    For example, a model trained on 5 class sentiment can be tested
    on a binary distribution with {"0": "0", "1": "0", "3": "1", "4": "1"}

    forgive_unmapped_labels says the following: in the case that the
    model predicts "2" in the above example for remap_labels, instead
    treat the model's prediction as whichever label it gave the
    highest score
    """
    model.eval()
    if label_map is None:
        label_map = {y: x for (x, y) in enumerate(model.labels)}
    correct = 0
    dataset_lengths = data.sort_dataset_by_len(dataset)

    for length in dataset_lengths.keys():
        batch = dataset_lengths[length]
        expected_labels = [label_map[x.sentiment] for x in batch]
        output = model(batch)

        for i in range(len(expected_labels)):
            predicted = torch.argmax(output[i])
            predicted_label = predicted.item()
            if remap_labels:
                if predicted_label in remap_labels:
                    predicted_label = remap_labels[predicted_label]
                else:
                    found = False
                    if forgive_unmapped_labels:
                        items = []
                        for j in range(len(output[i])):
                            items.append((output[i][j].item(), j))
                        items.sort(key=lambda x: -x[0])
                        for (value, j) in items:
                            if j in remap_labels:
                                predicted_label = remap_labels[j]
                                found = True
                                break
                    if not found:
                        continue
            if predicted_label == expected_labels[i]:
                correct = correct + 1
    return correct
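# Illustrative usage (mirrors the docstring above): to score a 5-class
# sentiment model against a binary test set, assuming integer class indices:
#
#   correct = score_dataset(model, test_set,
#                           remap_labels={0: 0, 1: 0, 3: 1, 4: 1},
#                           forgive_unmapped_labels=True)
#
# A prediction of class 2 has no mapping; with forgive_unmapped_labels=True
# the highest-scoring *mapped* class is used instead of marking it wrong.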
def score_dev_set(model, dev_set, dev_eval_scoring):
    predictions = dataset_predictions(model, dev_set)
    confusion_matrix = confusion_dataset(predictions, dev_set, model.labels)
    logger.info("Dev set confusion matrix:\n{}".format(format_confusion(confusion_matrix, model.labels)))
    correct, total = confusion_to_accuracy(confusion_matrix)
    macro_f1 = confusion_to_macro_f1(confusion_matrix)
    logger.info("Dev set: %d correct of %d examples.  Accuracy: %f" % (correct, total, correct / total))
    logger.info("Macro f1: {}".format(macro_f1))

    accuracy = correct / total
    if dev_eval_scoring == DevScoring.ACCURACY:
        return accuracy, accuracy, macro_f1
    elif dev_eval_scoring == DevScoring.WEIGHTED_F1:
        return macro_f1, accuracy, macro_f1
    raise ValueError("Unknown scoring method {}".format(dev_eval_scoring))

def intermediate_name(filename, epoch, dev_scoring, score):
    """
    Build an informative intermediate checkpoint name from a base name, epoch #, and accuracy
    """
    root, ext = os.path.splitext(filename)
    return root + ".E{epoch:04d}-{score_type}{acc:05.2f}".format(epoch=epoch, score_type=dev_scoring.value, acc=score * 100) + ext

def log_param_sizes(model):
    logger.debug("--- Model parameter sizes ---")
    total_size = 0
    for name, param in model.named_parameters():
        param_size = param.element_size() * param.nelement()
        total_size += param_size
        logger.debug("  %s %d %d %d", name, param.element_size(), param.nelement(), param_size)
    logger.debug("  Total size: %d", total_size)
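# Illustrative example: intermediate_name embeds the epoch and dev score in
# the checkpoint filename.  Assuming DevScoring.ACCURACY ('ACC') and a dev
# score of 0.8712:
#
#   intermediate_name("saved_models/classifier/model.pt", 3, DevScoring.ACCURACY, 0.8712)
#   # -> "saved_models/classifier/model.E0003-ACC87.12.pt"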
def train_model(trainer, model_file, checkpoint_file, args, train_set, dev_set, labels):
    tlogger.setLevel(logging.DEBUG)

    model = trainer.model
    optimizer = trainer.optimizer

    device = next(model.parameters()).device
    logger.info("Current device: %s", device)

    label_map = {y: x for (x, y) in enumerate(labels)}
    label_tensors = {y: torch.tensor(x, requires_grad=False, device=device) for (x, y) in enumerate(labels)}

    process_outputs = lambda x: x
    if args.loss == Loss.CROSS:
        logger.info("Creating CrossEntropyLoss")
        loss_function = nn.CrossEntropyLoss()
    elif args.loss == Loss.WEIGHTED_CROSS:
        logger.info("Creating weighted cross entropy loss w/o log")
        loss_function = loss.weighted_cross_entropy_loss([label_map[x.sentiment] for x in train_set], log_dampened=False)
    elif args.loss == Loss.LOG_CROSS:
        logger.info("Creating weighted cross entropy loss w/ log")
        loss_function = loss.weighted_cross_entropy_loss([label_map[x.sentiment] for x in train_set], log_dampened=True)
    elif args.loss == Loss.FOCAL:
        try:
            from focal_loss.focal_loss import FocalLoss
        except ImportError:
            raise ImportError("focal_loss not installed.  Must `pip install focal_loss_torch` to use the --loss=focal feature")
        logger.info("Creating FocalLoss with loss %f", args.loss_focal_gamma)
        process_outputs = lambda x: torch.softmax(x, dim=1)
        loss_function = FocalLoss(gamma=args.loss_focal_gamma)
    else:
        raise ValueError("Unknown loss function {}".format(args.loss))
    loss_function.to(device)

    train_set_by_len = data.sort_dataset_by_len(train_set)

    if trainer.global_step > 0:
        logger.info("Reloaded model for continued training.")
        if trainer.best_score is not None:
            logger.info("Previous best score: %.5f", trainer.best_score)

    log_param_sizes(model)

    if args.wandb:
        import wandb
        wandb_name = args.wandb_name if args.wandb_name else "%s_classifier" % args.shorthand
        wandb.init(name=wandb_name, config=args)
        wandb.run.define_metric('accuracy', summary='max')
        wandb.run.define_metric('macro_f1', summary='max')
        wandb.run.define_metric('epoch_loss', summary='min')

    for opt_name, opt in optimizer.items():
        current_lr = opt.param_groups[0]['lr']
        logger.debug("optimizer %s learning rate: %s", opt_name, current_lr)

    if args.save_intermediate_models and trainer.epochs_trained == 0:
        intermediate_file = intermediate_name(model_file, trainer.epochs_trained, args.dev_eval_scoring, 0.0)
        trainer.save(intermediate_file, save_optimizer=False)

    for trainer.epochs_trained in range(trainer.epochs_trained, args.max_epochs):
        running_loss = 0.0
        epoch_loss = 0.0
        shuffled_batches = data.shuffle_dataset(train_set_by_len, args.batch_size, args.batch_single_item)

        model.train()
        logger.info("Starting epoch %d", trainer.epochs_trained)
        if args.log_norms:
            model.log_norms()

        for batch_num, batch in enumerate(shuffled_batches):
            trainer.global_step += 1
            logger.debug("Starting batch: %d step %d", batch_num, trainer.global_step)

            batch_labels = torch.stack([label_tensors[x.sentiment] for x in batch])

            for opt in optimizer.values():
                opt.zero_grad()

            outputs = model(batch)
            outputs = process_outputs(outputs)
            batch_loss = loss_function(outputs, batch_labels)
            batch_loss.backward()
            for opt in optimizer.values():
                opt.step()

            running_loss += batch_loss.item()
            if (batch_num + 1) % args.tick == 0:
                train_loss = running_loss / args.tick
                logger.info("[%d, %5d] Average loss: %.3f", trainer.epochs_trained + 1, batch_num + 1, train_loss)
                if args.wandb:
                    wandb.log({'train_loss': train_loss}, step=trainer.global_step)
                if args.dev_eval_batches > 0 and (batch_num + 1) % args.dev_eval_batches == 0:
                    logger.info("---- Interim analysis ----")
                    dev_score, accuracy, macro_f1 = score_dev_set(model, dev_set, args.dev_eval_scoring)
                    if args.wandb:
                        wandb.log({'accuracy': accuracy, 'macro_f1': macro_f1}, step=trainer.global_step)
                    if trainer.best_score is None or dev_score > trainer.best_score:
                        trainer.best_score = dev_score
                        trainer.save(model_file, save_optimizer=False)
                        logger.info("Saved new best score model!  Accuracy %.5f   Macro F1 %.5f   Epoch %5d   Batch %d" % (accuracy, macro_f1, trainer.epochs_trained + 1, batch_num + 1))
                    model.train()
                    if args.log_norms:
                        model.log_norms()
                epoch_loss += running_loss
                running_loss = 0.0

        epoch_loss += running_loss
        logger.info("Finished epoch %d  Total loss %.3f" % (trainer.epochs_trained + 1, epoch_loss))
        dev_score, accuracy, macro_f1 = score_dev_set(model, dev_set, args.dev_eval_scoring)
        if args.wandb:
            wandb.log({'accuracy': accuracy, 'macro_f1': macro_f1, 'epoch_loss': epoch_loss}, step=trainer.global_step)
        if checkpoint_file:
            trainer.save(checkpoint_file, epochs_trained=trainer.epochs_trained + 1)
        if args.save_intermediate_models:
            intermediate_file = intermediate_name(model_file, trainer.epochs_trained + 1, args.dev_eval_scoring, dev_score)
            trainer.save(intermediate_file, save_optimizer=False)
        if trainer.best_score is None or dev_score > trainer.best_score:
            trainer.best_score = dev_score
            trainer.save(model_file, save_optimizer=False)
            logger.info("Saved new best score model!  Accuracy %.5f   Macro F1 %.5f   Epoch %5d" % (accuracy, macro_f1, trainer.epochs_trained + 1))

    if args.wandb:
        wandb.finish()

def main(args=None):
    args = parse_args(args)
    seed = utils.set_random_seed(args.seed)
    logger.info("Using random seed: %d", seed)
    utils.ensure_dir(args.save_dir)

    save_name = build_model_filename(args)

    train_set = None
    if args.train:
        train_set = data.read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
        logger.info("Using training set: %s", args.train_file)
        logger.info("Training set has %d labels", len(data.dataset_labels(train_set)))
    logger.info("Saving checkpoints: %s", args.checkpoint)

    checkpoint_file = None
    if args.checkpoint:
        checkpoint_file = utils.checkpoint_name(args.save_dir, save_name, args.checkpoint_save_name)
        logger.info("Checkpoint filename: %s", checkpoint_file)

    if not args.train and not args.load_name:
        if save_name:
            args.load_name = save_name
        else:
            raise ValueError("No model provided and not asked to train a model.  This makes no sense")

    if args.train and checkpoint_file is not None and os.path.exists(checkpoint_file):
        trainer = Trainer.load(checkpoint_file, args, load_optimizer=args.train)
    elif args.load_name:
        trainer = Trainer.load(args.load_name, args, load_optimizer=args.train)
    else:
        trainer = Trainer.build_new_model(args, train_set)

    trainer.model.log_configuration()

    if args.train:
        utils.log_training_args(args, tlogger)

        dev_set = data.read_dataset(args.dev_file, args.wordvec_type, min_len=None)
        logger.info("Using dev set: %s", args.dev_file)
        logger.info("Training set has %d items", len(train_set))
        logger.info("Dev set has %d items", len(dev_set))
        data.check_labels(trainer.model.labels, dev_set)

        train_model(trainer, save_name, checkpoint_file, args, train_set, dev_set, trainer.model.labels)

    if args.log_norms:
        trainer.model.log_norms()

    test_set = data.read_dataset(args.test_file, args.wordvec_type, min_len=None)
    logger.info("Using test set: %s", args.test_file)
    data.check_labels(trainer.model.labels, test_set)

    if args.test_remap_labels is None:
        predictions = dataset_predictions(trainer.model, test_set)
        confusion_matrix = confusion_dataset(predictions, test_set, trainer.model.labels)
        if args.output_predictions:
            logger.info("List of predictions: %s", predictions)
        logger.info("Confusion matrix:\n{}".format(format_confusion(confusion_matrix, trainer.model.labels)))
        correct, total = confusion_to_accuracy(confusion_matrix)
        logger.info("Macro f1: {}".format(confusion_to_macro_f1(confusion_matrix)))
    else:
        correct = score_dataset(trainer.model, test_set,
                                remap_labels=args.test_remap_labels,
                                forgive_unmapped_labels=args.forgive_unmapped_labels)
        total = len(test_set)
    logger.info("Test set: %d correct of %d examples.  Accuracy: %f" % (correct, total, correct / total))

if __name__ == '__main__':
    main()
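# Illustrative command lines (the module path matches this file; the data
# files and shorthand are the defaults above and are assumptions otherwise):
#
#   python -m stanza.models.classifier \
#       --train_file data/sentiment/en_sstplus.train.txt \
#       --dev_file data/sentiment/en_sst3roots.dev.txt \
#       --test_file data/sentiment/en_sst3roots.test.txt \
#       --shorthand en_sstplus --optim madgrad
#
# To evaluate an existing model without training:
#
#   python -m stanza.models.classifier --no_train \
#       --load_name saved_models/classifier/model.pt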