import torch
import torch.nn as nn
import os
import logging
import math

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from stanza.models.common.char_model import CharacterModel, CharacterLanguageModel
from typing import List, Tuple
from stanza.models.common.vocab import UNK_ID
from stanza.models.lemma_classifier import utils
from stanza.models.lemma_classifier.base_model import LemmaClassifier
from stanza.models.lemma_classifier.constants import ModelType

logger = logging.getLogger('stanza.lemmaclassifier')

class LemmaClassifierLSTM(LemmaClassifier):
    """
    Model architecture:
        Extracts word embeddings over the sentence, passes embeddings into a bi-LSTM to get a sentence encoding.
        From the LSTM output, we get the embedding of the specific token that we classify on. That embedding
        is fed into an MLP for classification.
    """
    def __init__(self, model_args, output_dim, pt_embedding, label_decoder, upos_to_id,
                 known_words, target_words, target_upos,
                 use_charlm=False, charlm_forward_file=None, charlm_backward_file=None):
        """
        Args:
            vocab_size (int): Size of the vocab being used (if custom vocab)
            output_dim (int): Size of output vector from MLP layer
            upos_to_id (Mapping[str, int]): A dictionary mapping UPOS tag strings to their respective IDs
            pt_embedding (Pretrain): pretrained embeddings
            known_words (list(str)): Words which are in the training data
            target_words (set(str)): a set of the words which might need lemmatization
            use_charlm (bool): Whether or not to use the charlm embeddings
            charlm_forward_file (str): The path to the forward pass model for the character language model
            charlm_backward_file (str): The path to the backward pass model for the character language model.

        Kwargs:
            upos_emb_dim (int): The size of the UPOS tag embeddings
            num_heads (int): The number of heads to use for attention. If there are more than 0 heads, attention will be used instead of the LSTM.

        Raises:
            FileNotFoundError: if the forward or backward charlm file cannot be found.
        
        """
        super(LemmaClassifierLSTM, self).__init__(label_decoder, target_words, target_upos)
        self.model_args = model_args

        self.hidden_dim = model_args['hidden_dim']
        self.input_size = 0
        self.num_heads = model_args['num_heads']

        # the pretrained embeddings are frozen and excluded from the saved model
        emb_matrix = pt_embedding.emb
        self.add_unsaved_module('embeddings', nn.Embedding.from_pretrained(emb_matrix, freeze=True))
        # normalize non-breaking spaces in the pretrained vocab before building the lookup
        self.vocab_map = {word.replace('\xa0', ' '): i for i, word in enumerate(pt_embedding.vocab)}
        self.vocab_size = emb_matrix.shape[0]
        self.embedding_dim = emb_matrix.shape[1]

        self.known_words = known_words
        # index 0 of the delta embedding is the padding vector, used for unknown words
        self.known_word_map = {word: idx + 1 for idx, word in enumerate(known_words)}
        self.delta_embedding = nn.Embedding(num_embeddings=len(known_words) + 1,
                                            embedding_dim=self.embedding_dim,
                                            padding_idx=0)
        nn.init.normal_(self.delta_embedding.weight, std=0.01)
        self.input_size += self.embedding_dim

        self.use_charlm = use_charlm
        if self.use_charlm:
            if charlm_forward_file is None or not os.path.exists(charlm_forward_file):
                raise FileNotFoundError(f'Could not find forward character model: {charlm_forward_file}')
            if charlm_backward_file is None or not os.path.exists(charlm_backward_file):
                raise FileNotFoundError(f'Could not find backward character model: {charlm_backward_file}')
            self.add_unsaved_module('charmodel_forward', CharacterLanguageModel.load(charlm_forward_file, finetune=False))
            self.add_unsaved_module('charmodel_backward', CharacterLanguageModel.load(charlm_backward_file, finetune=False))
            self.input_size += self.charmodel_forward.hidden_dim() + self.charmodel_backward.hidden_dim()

        self.upos_emb_dim = self.model_args['upos_emb_dim']
        self.upos_to_id = upos_to_id
        if self.upos_emb_dim > 0 and self.upos_to_id is not None:
            self.upos_emb = nn.Embedding(num_embeddings=len(self.upos_to_id),
                                         embedding_dim=self.upos_emb_dim,
                                         padding_idx=0)
            self.input_size += self.upos_emb_dim

        device = next(self.parameters()).device

        if self.num_heads > 0:
            # the attention embed dim must be divisible by the number of heads
            self.input_size = utils.round_up_to_multiple(self.input_size, self.num_heads)
            self.multihead_attn = nn.MultiheadAttention(embed_dim=self.input_size,
                                                        num_heads=self.num_heads,
                                                        batch_first=True).to(device)
            logger.debug(f"Using attention mechanism with embed dim {self.input_size} and {self.num_heads} attention heads.")
        else:
            self.lstm = nn.LSTM(self.input_size,
                                self.hidden_dim,
                                batch_first=True,
                                bidirectional=True)
            logger.debug("Using LSTM mechanism.")

        # the LSTM is bidirectional, so its token representations are twice the hidden dim
        mlp_input_size = self.hidden_dim * 2 if self.num_heads == 0 else self.input_size
        self.mlp = nn.Sequential(
            nn.Linear(mlp_input_size, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )
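
    # Worked example of the feature-size bookkeeping above (illustrative numbers only,
    # not defaults from this module): with 100-dim pretrained word vectors,
    # upos_emb_dim=20, and charlm hidden sizes of 1024 per direction,
    # input_size = 100 + 20 + 1024 + 1024 = 2168. With num_heads=3,
    # utils.round_up_to_multiple would raise that to 2169 so the attention
    # embed dim divides evenly across the heads; with num_heads=0 it is untouched.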
zLemmaClassifierLSTM.__init__c              	      sf       j  j j j jt jt j	d} fdd|d 
 D }|D ]}|d |= q)|S )N)paramsr[   
model_typeargsrG   r:   r\   r]   c                    s   g | ]	}  |r|qS r   )is_unsaved_module)r   krX   r   r   
<listcomp>x   s    z5LemmaClassifierLSTM.get_save_dict.<locals>.<listcomp>rd   )
state_dictr[   re   namer.   rG   r:   listr\   r]   keys)rX   	save_dictskippedrh   r   ri   r   get_save_dictm   s   
    def convert_tags(self, upos_tags: List[List[str]]):
        if self.upos_to_id is not None:
            return [[self.upos_to_id[x] for x in sentence] for sentence in upos_tags]
        return None
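
    # Example (hypothetical mapping, for illustration): with upos_to_id = {'DET': 1, 'NOUN': 2},
    # convert_tags([['DET', 'NOUN']]) returns [[1, 2]]; with upos_to_id = None it returns None.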
    def forward(self, pos_indices: List[int], sentences: List[List[str]], upos_tags: List[List[int]]):
        """
        Computes the forward pass of the neural net

        Args:
            pos_indices (List[int]): A list of the position index of the target token for lemmatization classification in each sentence.
            sentences (List[List[str]]): A list of the token-split sentences of the input data.
            upos_tags (List[List[int]]): A list of the upos tags for each token in every sentence.

        Returns:
            torch.tensor: Output logits of the neural network, with shape (n, output_size), where n is the number of sentences.
        """
        device = next(self.parameters()).device
        batch_size = len(sentences)

        # map each word to its id in the pretrained embedding and in the
        # trainable delta embedding over the known words (0 = unknown/padding)
        token_ids = []
        delta_token_ids = []
        for words in sentences:
            sentence_token_ids = [self.vocab_map.get(word.lower(), UNK_ID) for word in words]
            token_ids.append(torch.tensor(sentence_token_ids, device=device))

            sentence_delta_token_ids = [self.known_word_map.get(word.lower(), 0) for word in words]
            delta_token_ids.append(torch.tensor(sentence_delta_token_ids, device=device))

        token_ids = pad_sequence(token_ids, batch_first=True)
        delta_token_ids = pad_sequence(delta_token_ids, batch_first=True)
        embedded = self.embeddings(token_ids) + self.delta_embedding(delta_token_ids)

        if self.upos_emb_dim > 0:
            upos_tags = [torch.tensor(sentence_tags) for sentence_tags in upos_tags]
            upos_tags = pad_sequence(upos_tags, batch_first=True, padding_value=0).to(device)
            pos_emb = self.upos_emb(upos_tags)
            embedded = torch.cat((embedded, pos_emb), 2).to(device)

        if self.use_charlm:
            char_reps_forward = self.charmodel_forward.build_char_representation(sentences)
            char_reps_backward = self.charmodel_backward.build_char_representation(sentences)
            char_reps_forward = pad_sequence(char_reps_forward, batch_first=True)
            char_reps_backward = pad_sequence(char_reps_backward, batch_first=True)
            embedded = torch.cat((embedded, char_reps_forward, char_reps_backward), 2)

        if self.num_heads > 0:
            # attention has no notion of token order, so add sinusoidal positional encodings
            def positional_encoding(seq_len, d_model, device):
                encoding = torch.zeros(seq_len, d_model, device=device)
                position = torch.arange(0, seq_len, dtype=torch.float, device=device).unsqueeze(1)
                div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).to(device)
                encoding[:, 0::2] = torch.sin(position * div_term)
                encoding[:, 1::2] = torch.cos(position * div_term)
                return encoding.unsqueeze(0)

            seq_len, d_model = embedded.shape[1], embedded.shape[2]
            embedded = embedded + positional_encoding(seq_len, d_model, device)

        padded_sequences = embedded  # already padded to a common sequence length above
        lengths = torch.tensor([len(seq) for seq in sentences])

        if self.num_heads > 0:
            # causal mask: position i may only attend to positions <= i
            target_seq_length, src_seq_length = padded_sequences.size(1), padded_sequences.size(1)
            attn_mask = torch.triu(torch.ones(target_seq_length, src_seq_length, dtype=torch.bool, device=device), diagonal=1)
            attn_mask = attn_mask.unsqueeze(0).repeat(batch_size * self.num_heads, 1, 1)
            attn_output, attn_weights = self.multihead_attn(padded_sequences, padded_sequences, padded_sequences, attn_mask=attn_mask)
            # take the representation of the target token in each sentence
            token_reps = attn_output[torch.arange(batch_size), pos_indices]
        else:
            # enforce_sorted=False since batches are not pre-sorted by sentence length
            packed_sequences = pack_padded_sequence(padded_sequences, lengths, batch_first=True, enforce_sorted=False)
            lstm_out, (hidden, _) = self.lstm(packed_sequences)
            unpacked_lstm_outputs, _ = pad_packed_sequence(lstm_out, batch_first=True)
            token_reps = unpacked_lstm_outputs[torch.arange(batch_size), pos_indices]

        output = self.mlp(token_reps)
        return output

    def model_type(self):
        return ModelType.LSTM
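
if __name__ == '__main__':
    # Minimal smoke test (not part of the original module): a sketch exercising the
    # LSTM path with a tiny stand-in for the pretrained embedding object. `TinyPretrain`
    # and every number below are hypothetical, and this assumes the LemmaClassifier base
    # class accepts (label_decoder, target_words, target_upos) as reconstructed above;
    # real callers pass a stanza Pretrain and the vocabularies gathered during training.
    class TinyPretrain:
        def __init__(self):
            self.vocab = ['<PAD>', '<UNK>', 'the', 'cat', "'s", 'toy']
            self.emb = torch.randn(len(self.vocab), 8)

    model_args = {'hidden_dim': 16, 'upos_emb_dim': 0, 'num_heads': 0}
    model = LemmaClassifierLSTM(model_args,
                                output_dim=2,
                                pt_embedding=TinyPretrain(),
                                label_decoder={0: 'be', 1: 'have'},
                                upos_to_id=None,
                                known_words=['the', 'cat', "'s", 'toy'],
                                target_words={"'s"},
                                target_upos={'AUX'})
    # classify the token at position 2 ("'s") in a single sentence;
    # upos tags are unused here since upos_emb_dim is 0
    logits = model(pos_indices=[2],
                   sentences=[['the', 'cat', "'s", 'toy']],
                   upos_tags=[[0, 0, 0, 0]])
    print(logits.shape)  # expected: torch.Size([1, 2])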