import os
import sys

parentdir = os.path.dirname(__file__)
parentdir = os.path.dirname(parentdir)
parentdir = os.path.dirname(parentdir)
sys.path.append(parentdir)

import logging
import argparse

from typing import Any, List, Tuple, Mapping
from collections import defaultdict
from numpy import random

import torch
import torch.nn as nn

import stanza
from stanza.models.common.utils import default_device
from stanza.models.lemma_classifier import utils
from stanza.models.lemma_classifier.base_model import LemmaClassifier
from stanza.models.lemma_classifier.lstm_model import LemmaClassifierLSTM
from stanza.models.lemma_classifier.transformer_model import LemmaClassifierWithTransformer
from stanza.utils.confusion import format_confusion
from stanza.utils.get_tqdm import get_tqdm

tqdm = get_tqdm()

logger = logging.getLogger('stanza.lemmaclassifier')


def get_weighted_f1(mcc_results: Mapping[int, Mapping[str, float]], confusion: Mapping[int, Mapping[int, int]]) -> float:
    """
    Computes the weighted F1 score across an evaluation set.

    The weight of a class's F1 score is equal to the number of examples in evaluation. This makes classes that have more
    examples in the evaluation more impactful to the weighted f1.
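
    As an illustration (hypothetical numbers, not taken from any real run): if class 0 has an F1 of 0.9
    over 8 evaluation examples and class 1 has an F1 of 0.5 over 2 examples, the weighted F1 is
    (0.9 * 8 + 0.5 * 2) / 10 = 0.82.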
    """
    num_total_examples = 0
    weighted_f1 = 0

    for class_id in mcc_results:
        class_f1 = mcc_results[class_id].get("f1")
        num_class_examples = sum(confusion.get(class_id, {}).values())
        weighted_f1 += class_f1 * num_class_examples
        num_total_examples += num_class_examples

    return weighted_f1 / num_total_examples


def evaluate_sequences(gold_tag_sequences: List[Any], pred_tag_sequences: List[Any], label_decoder, verbose=True):
    """
    Evaluates a model's predicted tags against a set of gold tags. Computes precision, recall, and f1 for all classes.

    Precision = true positives / (true positives + false positives)
    Recall = true positives / (true positives + false negatives)
    F1 = 2 * (Precision * Recall) / (Precision + Recall)

    Returns:
        1. Multi class result dictionary, where each class is a key and maps to another map of its F1, precision, and recall scores.
           e.g. multiclass_results[0]["precision"] would give class 0's precision.
        2. Confusion matrix, where each key is a gold tag and its value is another map with a key of the predicted tag with value of that (gold, pred) count.
           e.g. confusion[0][1] = 6 would mean that for gold tag 0, the model predicted tag 1 a total of 6 times.
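           As a worked example (illustrative only): with gold tags [0, 0, 1] and predicted tags [0, 1, 1],
           class 0 has precision 1/1 = 1.0 and recall 1/2 = 0.5, class 1 has precision 1/2 = 0.5 and
           recall 1/1 = 1.0, so both classes get F1 = 2 * (1.0 * 0.5) / (1.0 + 0.5) = 2/3.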
    """
    assert len(gold_tag_sequences) == len(pred_tag_sequences), \
        f"Length of gold tag sequences is {len(gold_tag_sequences)}, while length of predicted tag sequence is {len(pred_tag_sequences)}"

    confusion = defaultdict(lambda: defaultdict(int))
    reverse_label_decoder = {y: x for x, y in label_decoder.items()}
    for gold, pred in zip(gold_tag_sequences, pred_tag_sequences):
        confusion[reverse_label_decoder[gold]][reverse_label_decoder[pred]] += 1

    multi_class_result = defaultdict(lambda: defaultdict(float))
    # compute precision, recall, and F1 for each gold class from the confusion counts
    for gold_tag in confusion.keys():
        try:
            prec = confusion.get(gold_tag, {}).get(gold_tag, 0) / sum([confusion.get(k, {}).get(gold_tag, 0) for k in confusion.keys()])
        except ZeroDivisionError:
            prec = 0.0

        try:
            recall = confusion.get(gold_tag, {}).get(gold_tag, 0) / sum(confusion.get(gold_tag, {}).values())
        except ZeroDivisionError:
            recall = 0.0

        try:
            f1 = 2 * (prec * recall) / (prec + recall)
        except ZeroDivisionError:
            f1 = 0.0

        multi_class_result[gold_tag] = {"precision": prec, "recall": recall, "f1": f1}

    if verbose:
        for lemma in multi_class_result:
            logger.info(f"Lemma '{lemma}' had precision {100 * multi_class_result[lemma]['precision']}, recall {100 * multi_class_result[lemma]['recall']} and F1 score of {100 * multi_class_result[lemma]['f1']}")

    weighted_f1 = get_weighted_f1(multi_class_result, confusion)

    return multi_class_result, confusion, weighted_f1


def model_predict(model: nn.Module, position_indices: torch.Tensor, sentences: List[List[str]], upos_tags: List[List[int]] = []) -> torch.Tensor:
    """
    Runs a LemmaClassifierLSTM or LemmaClassifierWithTransformer over a batch of examples, given the position index of the target token in each sentence.

    Args:
        model (LemmaClassifier): A trained LemmaClassifier that is able to predict on a target token.
        position_indices (Tensor[int]): A tensor of the (zero-indexed) position of the target token in each sentence of the batch.
        sentences (List[List[str]]): A list of lists of the tokenized strings of the input sentences.
        upos_tags (List[List[int]], optional): The UPOS tag ids of the tokens in each sentence, for models that use UPOS features. Defaults to [].

    Returns:
        (torch.Tensor): The indices of the predicted classes in `model`'s output, one per example in the batch.
    """
    with torch.no_grad():
        logits = model(position_indices, sentences, upos_tags)  # shape (batch_size, output_size)
        predicted_class = torch.argmax(logits, dim=1)  # shape (batch_size,)

    return predicted_class


def evaluate_model(model: nn.Module, eval_path: str, verbose: bool = True, is_training: bool = False) -> Tuple[Mapping, Mapping, float, float]:
    """
    Helper function for model evaluation

    Args:
        model (LemmaClassifierLSTM or LemmaClassifierWithTransformer): A LemmaClassifier instance with its trained weights already loaded, whose architecture matches the saved model being evaluated.
        eval_path (str): Path to the saved evaluation dataset.
        verbose (bool, optional): True if `evaluate_sequences()` should print the F1, Precision, and Recall for each class. Defaults to True.
        is_training (bool, optional): Whether the model is in training mode. If the model is training, we do not change it to eval mode.

    Returns:
        1. Multi-class results (Mapping[int, Mapping[str, float]]): first map has keys as the classes (lemma indices) and value is
                                                                    another map with key of "f1", "precision", or "recall" with corresponding values.
        2. Confusion Matrix (Mapping[int, Mapping[int, int]]): A confusion matrix with keys equal to the index of the gold tag, and a value of the
                                                               map with the key as the predicted tag and corresponding count of that (gold, pred) pair.
        3. Accuracy (float): the total accuracy (num correct / total examples) across the evaluation set.
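
    Example (illustrative; `path_to_saved_model` and `path_to_eval_file` are placeholders):
        model = LemmaClassifier.load(path_to_saved_model, args)   # as done in main() below
        mc_results, confusion, accuracy, weighted_f1 = evaluate_model(model, path_to_eval_file)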
    F)r    shufflezEvaluating on evaluation file )r   r   z"Evaluating examples from data filez3Finished evaluating on dataset. Computing scores...)r;   
Accuracy: z (/)zLabel decoder: )r   toevalr	   Datasetr    r9   r:   labelstqdmrM   rH   r   r4   tolistrB   )rC   rN   r;   rO   devicedatasetcorrecttotal	gold_tags	pred_tagsrE   pos_indicesrF   rW   r>   correct_predsaccuracy
mc_resultsr   r   r   r   r   evaluate_model   s*   

rd   c           	      C   s  t  }|jdtddd |jdtddd |jdtd	d
d |jdtd dd |jddddd |jdtd dd |jdttjtjt	dddd |jdttjtjt	dddd |jdttjtjt	dddd |jdtd d!d |jd"td d#d |jd$td%d& |s|
| n|} td' t| } | D ]}t| d(| |   qtd) td*| d+  d,| d-   t| d+ | }t|| d- \}}}}td.t|  td/ td0t| td/ td1|  td/ td2|  ||||fS )3Nz--vocab_sizei'  zNumber of tokens in vocab)typedefaulthelpz--embedding_dimr3   z?Number of dimensions in word embeddings (currently using GloVe)z--hidden_dim   zSize of hidden layerz--wordvec_pretrain_filez'Exact name of the pretrain file to readz--charlm
store_trueFz(Whether not to use the charlm embeddings)actionrf   rg   z--charlm_shorthandz=Shorthand for character-level language model training corpus.z--charlm_forward_filecharlm_filesz1billion_forward.ptzPath to forward charlm filez--charlm_backward_filez1billion_backwards.ptzPath to backward charlm filez--save_namesaved_modelszlemma_classifier_model.ptzPath to model save filez--model_typerobertaz8Which transformer to use ('bert' or 'roberta' or 'lstm')z--bert_modelz>Use a specific transformer instead of the default bert/robertaz--eval_filezpath to evaluation file)re   rg   z0Running training script with the following args:z: z<------------------------------------------------------------z$Attempting evaluation of model from 	save_namez	 on file 	eval_filezMCC Results: .______________________________________________zConfusion:
%srQ   zWeighted f1: )argparseArgumentParseradd_argumentr#   strospathjoindirname__file__
parse_argsr9   r:   varsr
   loadrd   dictr   )	argspredefined_argsparserargrC   r   r   accr   r   r   r   main   s<   &&&




r   __main__)T)TF)NN)3ru   sysrv   rx   ry   	parentdirappendloggingrq   typingr   r   r   r   collectionsr   numpyr   rH   torch.nnnnstanzastanza.models.common.utilsr   stanza.models.lemma_classifierr	   )stanza.models.lemma_classifier.base_modelr
   )stanza.models.lemma_classifier.lstm_modelr   0stanza.models.lemma_classifier.transformer_modelr   stanza.utils.confusionr   stanza.utils.get_tqdmr   rX   	getLoggerr9   r#   rt   r+   r   rB   ModuleTensorrM   boolrd   r   __name__r   r   r   r   <module>   s@    
6 89.
5,
