# Reconstructed from the compiled Python 3.10 bytecode cache of
# site-packages/stanza/models/common/bert_embedding.py.  Class and function
# names, docstrings, and string constants are taken directly from the cache;
# the statement-level details inside the longer functions are a best-effort
# reconstruction and may differ in small ways from the original source.

import math
import logging

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, PackedSequence

logger = logging.getLogger('stanza')

BERT_ARGS = {
    "vinai/phobert-base":  {"use_fast": True},
    "vinai/phobert-large": {"use_fast": True},
}


class TextTooLongError(ValueError):
    """
    A text was too long for the underlying model (possibly BERT)
    """
    def __init__(self, length, max_len, line_num, text):
        super().__init__("Found a text of length %d (possibly after tokenizing).  Maximum handled length is %d  Error occurred at line %d" % (length, max_len, line_num))
        self.line_num = line_num
        self.text = text


def update_max_length(model_name, tokenizer):
    # some tokenizers advertise an incorrect (huge) model_max_length
    if model_name in ('hf-internal-testing/tiny-bert',
                      'google/muril-base-cased',
                      'google/muril-large-cased',
                      'airesearch/wangchanberta-base-att-spm-uncased',
                      'camembert/camembert-large',
                      'hfl/chinese-electra-180g-large-discriminator',
                      'NYTK/electra-small-discriminator-hungarian'):
        tokenizer.model_max_length = 512


def load_tokenizer(model_name, tokenizer_kwargs=None, local_files_only=False):
    if model_name:
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError("Please install transformers library for BERT support! Try `pip install transformers`.")
        bert_args = BERT_ARGS.get(model_name, dict())
        if not model_name.startswith("vinai/phobert"):
            bert_args["add_prefix_space"] = True
        if tokenizer_kwargs:
            bert_args.update(tokenizer_kwargs)
        bert_args["local_files_only"] = local_files_only
        bert_tokenizer = AutoTokenizer.from_pretrained(model_name, **bert_args)
        update_max_length(model_name, bert_tokenizer)
        return bert_tokenizer
    return None


def load_bert(model_name, tokenizer_kwargs=None, local_files_only=False):
    if model_name:
        try:
            from transformers import AutoModel
        except ImportError:
            raise ImportError("Please install transformers library for BERT support! Try `pip install transformers`.")
        bert_model = AutoModel.from_pretrained(model_name, local_files_only=local_files_only)
        bert_tokenizer = load_tokenizer(model_name, tokenizer_kwargs=tokenizer_kwargs, local_files_only=local_files_only)
        return bert_model, bert_tokenizer
    return None, None


def tokenize_manual(model_name, sent, tokenizer):
    """
    Tokenize a sentence manually, using for checking long sentences and PHOBert.
    """
    # PhoBERT expects _ instead of spaces inside multi-syllable words
    tokenized = [word.replace("\xa0", "_").replace(" ", "_") for word in sent] if model_name.startswith("vinai/phobert") else [word.replace("\xa0", " ") for word in sent]

    # concatenate to a sentence and tokenize with the AutoTokenizer
    sentence = ' '.join(tokenized)
    tokenized = tokenizer.tokenize(sentence)

    # convert tokens to ids, then add start and end tokens
    sent_ids = tokenizer.convert_tokens_to_ids(tokenized)
    tokenized_sent = [tokenizer.bos_token_id] + sent_ids + [tokenizer.eos_token_id]

    return tokenized, tokenized_sent


def filter_data(model_name, data, tokenizer=None, log_level=logging.DEBUG):
    """
    Filter out the (NER, POS) data that is too long for BERT model.
    """
    if tokenizer is None:
        tokenizer = load_tokenizer(model_name)
    filtered_data = []
    for sent in data:
        sentence = [word if isinstance(word, str) else word[0] for word in sent]
        _, tokenized_sent = tokenize_manual(model_name, sentence, tokenizer)

        if len(tokenized_sent) > tokenizer.model_max_length - 2:
            continue

        filtered_data.append(sent)

    logger.log(log_level, "Eliminated %d of %d datapoints because their length is over maximum size of BERT model.", len(data) - len(filtered_data), len(data))
    return filtered_data


def needs_length_filter(model_name):
    """
    TODO: we were lazy and didn't implement any form of length fudging for models other than bert/roberta/electra
    """
    if 'bart' in model_name or 'xlnet' in model_name:
        return True
    if model_name.startswith("vinai/phobert"):
        return True
    return False


def cloned_feature(feature, num_layers, detach=True):
    """
    Clone & detach the feature, keeping the last N layers (or averaging -2,-3,-4 if not specified)

    averaging 3 of the last 4 layers worked well for non-VI languages
    """
    if num_layers is None:
        feature = torch.stack(feature[-4:-1], axis=3).sum(axis=3) / 3
    else:
        feature = torch.stack(feature[-num_layers:], axis=3)
    if detach:
        return feature.clone().detach()
    return feature


def extract_bart_word_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    """
    Handles vi-bart.  May need testing before using on other bart

    https://github.com/VinAIResearch/BARTpho
    """
    processed = []

    # BARTpho expects word-segmented text, with _ joining the syllables of a word
    sentences = [" ".join([word.replace(" ", "_") for word in sentence]) for sentence in data]
    tokenized = tokenizer(sentences, return_tensors='pt', padding=True, return_attention_mask=True)
    input_ids = tokenized['input_ids'].to(device)
    attention_mask = tokenized['attention_mask'].to(device)

    for i in range(int(math.ceil(len(sentences) / 128))):
        start_sentence = i * 128
        end_sentence = min(start_sentence + 128, len(sentences))
        if detach:
            with torch.no_grad():
                features = model(input_ids[start_sentence:end_sentence],
                                 attention_mask=attention_mask[start_sentence:end_sentence],
                                 output_hidden_states=True)
                features = cloned_feature(features.decoder_hidden_states, num_layers, detach)
        else:
            features = model(input_ids[start_sentence:end_sentence],
                             attention_mask=attention_mask[start_sentence:end_sentence],
                             output_hidden_states=True)
            features = cloned_feature(features.decoder_hidden_states, num_layers, detach)

        for feature, sentence in zip(features, data[start_sentence:end_sentence]):
            feature = feature[:len(sentence) + 2]
            if not keep_endpoints:
                feature = feature[1:-1]
            processed.append(feature)

    return processed


def extract_phobert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    """
    Extract transformer embeddings using a method specifically for phobert

    Since phobert doesn't have the is_split_into_words / tokenized.word_ids(batch_index=0)
    capability, we instead look for @@ to denote a continued token.
    data: list of list of string (the text tokens)
    """
    tokenized_sents = []  # tokenized sentences as tensors, including the start and end tokens
    list_tokenized = []   # tokenized sentences as text, used to find word boundaries

    for idx, sent in enumerate(data):
        tokenized, tokenized_sent = tokenize_manual(model_name, sent, tokenizer)
        list_tokenized.append(tokenized)

        if len(tokenized_sent) > tokenizer.model_max_length:
            logger.error("Invalid size, max size: %d, got %d %s", tokenizer.model_max_length, len(tokenized_sent), sent)
            raise TextTooLongError(len(tokenized_sent), tokenizer.model_max_length, idx, " ".join(sent))

        tokenized_sents.append(torch.tensor(tokenized_sent).detach())

    size = len(tokenized_sents)

    # pad the inputs so they can be fed to the model in batches of 128
    tokenized_sents_padded = torch.nn.utils.rnn.pad_sequence(tokenized_sents, batch_first=True, padding_value=tokenizer.pad_token_id)

    features = []
    for i in range(int(math.ceil(size / 128))):
        padded_input = tokenized_sents_padded[128*i:128*i+128]
        attention_mask = torch.zeros(padded_input.shape[0], padded_input.shape[1], device=device)
        for sent_idx, sent in enumerate(tokenized_sents[128*i:128*i+128]):
            attention_mask[sent_idx, :len(sent)] = 1
        if detach:
            with torch.no_grad():
                output = model(padded_input.to(device), attention_mask=attention_mask, output_hidden_states=True)
                features += cloned_feature(output.hidden_states, num_layers, detach)
        else:
            output = model(padded_input.to(device), attention_mask=attention_mask, output_hidden_states=True)
            features += cloned_feature(output.hidden_states, num_layers, detach)

    assert len(features) == size

    # a wordpiece which does not follow an @@ piece starts a new word;
    # +1 compensates for the bos token at the front of each sentence
    offsets = [[idx2 + 1 for idx2, _ in enumerate(tokens)
                if idx2 == 0 or not tokens[idx2 - 1].endswith("@@")]
               for tokens in list_tokenized]
    if keep_endpoints:
        offsets = [[0] + off + [-1] for off in offsets]
    processed = [feature[offset] for feature, offset in zip(features, offsets)]

    return processed


# tokenizers with the "vaporized word" problem handled by fix_blank_tokens
BAD_TOKENIZERS = ('bert-base-german-cased',
                  'dbmdz/bert-base-german-cased',
                  'dbmdz/bert-base-italian-xxl-cased',
                  'dbmdz/bert-base-italian-cased',
                  'dbmdz/electra-base-italian-xxl-cased-discriminator',
                  'avichr/heBERT',
                  'onlplab/alephbert-base',
                  'imvladikon/alephbertgimmel-base-512',
                  'cahya/bert-base-indonesian-1.5G',
                  'indolem/indobert-base-uncased',
                  'l3cube-pune/marathi-roberta')


def fix_blank_tokens(tokenizer, data):
    """ Patch bert tokenizers with missing characters

    There is an issue that some tokenizers (so far the German ones identified above)
    tokenize soft hyphens or other unknown characters into nothing
    If an entire word is tokenized as a soft hyphen, this means the tokenizer
    simply vaporizes that word.  The result is we're missing an embedding for
    an entire word we wanted to use.

    The solution we take here is to look for any words which get vaporized
    in such a manner, eg `len(token) == 2`, and replace it with a regular "-"

    Actually, recently we have found that even the Bert / Electra tokenizer
    can do this in the case of "words" which are one special character long,
    so the easiest thing to do is just always run this function
    """
    new_data = []
    for sentence in data:
        tokenized = tokenizer(sentence, is_split_into_words=False).input_ids
        new_sentence = [word if len(token) > 2 else "-" for word, token in zip(sentence, tokenized)]
        new_data.append(new_sentence)
    return new_data


def extract_xlnet_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach=True):
    tokenized = tokenizer(data, is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=False)

    # one slot for the start token, one per word, and one for the end token
    list_offsets = [[None] * (len(sentence) + 2) for sentence in data]
    for idx in range(len(data)):
        offsets = tokenized.word_ids(batch_index=idx)
        # xlnet puts its special tokens at the end of the sentence; below the
        # ids are rebuilt with a bos token at position 0 and an eos at the end
        list_offsets[idx][0] = 0
        for pos, offset in enumerate(offsets):
            if offset is None:
                break
            # keeps the last wordpiece of each word; +1 accounts for the bos token
            list_offsets[idx][offset + 1] = pos + 1
        list_offsets[idx][-1] = list_offsets[idx][-2] + 1
        if any(x is None for x in list_offsets[idx]):
            raise ValueError("OOPS, hit None when preparing to use Bert\ndata[idx]: {}\noffsets: {}\nlist_offsets[idx]: {}".format(data[idx], offsets, list_offsets[idx]))

        if len(offsets) > tokenizer.model_max_length - 2:
            logger.error("Invalid size, max size: %d, got %d %s", tokenizer.model_max_length, len(offsets), data[idx])
            raise TextTooLongError(len(offsets), tokenizer.model_max_length, idx, " ".join(data[idx]))

    features = []
    for i in range(int(math.ceil(len(data) / 128))):
        # drop the trailing <sep> <cls> and wrap the ids with bos / eos instead
        input_ids = [[tokenizer.bos_token_id] + x[:-2] + [tokenizer.eos_token_id]
                     for x in tokenized['input_ids'][128*i:128*i+128]]
        size = max(len(x) for x in input_ids)
        attention_mask = torch.zeros(len(input_ids), size, dtype=torch.long, device=device)
        for idx, input_row in enumerate(input_ids):
            attention_mask[idx, :len(input_row)] = 1
            if len(input_row) < size:
                input_row.extend([tokenizer.pad_token_id] * (size - len(input_row)))
        if detach:
            with torch.no_grad():
                id_tensor = torch.tensor(input_ids, device=device)
                feature = model(id_tensor, attention_mask=attention_mask, output_hidden_states=True)
                features += cloned_feature(feature.hidden_states, num_layers, detach)
        else:
            id_tensor = torch.tensor(input_ids, device=device)
            feature = model(id_tensor, attention_mask=attention_mask, output_hidden_states=True)
            features += cloned_feature(feature.hidden_states, num_layers, detach)

    processed = []
    if not keep_endpoints:
        list_offsets = [x[1:-1] for x in list_offsets]
    for feature, offsets in zip(features, list_offsets):
        new_sent = feature[offsets]
        processed.append(new_sent)

    return processed


def build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device):
    """
    Extract an embedding from the given transformer for a certain attention mask and tokens range

    In the event that the tokens are longer than the max length
    supported by the model, the range is split up into overlapping
    sections and the overlapping pieces are connected.  No idea if
    this is actually any good, but at least it returns something
    instead of horribly failing

    TODO: at least two upgrades are very relevant
      1) cut off some overlap at the end as well
      2) use this on the phobert, bart, and xln versions as well
    """
    if id_tensor.shape[1] <= tokenizer.model_max_length:
        features = model(id_tensor, attention_mask=attention_tensor, output_hidden_states=True)
        return cloned_feature(features.hidden_states, num_layers, detach)

    # too long: run overlapping windows over the input and stitch them together
    # (the exact window / overlap arithmetic was not recoverable from the
    # bytecode; advancing by half the max length is an assumption)
    slices = []
    slice_len = tokenizer.model_max_length // 2           # how far each window advances
    prefix_len = tokenizer.model_max_length - slice_len   # overlap dropped from later windows
    if slice_len < 8:
        raise RuntimeError("Really tiny tokenizer!")

    remaining_attention = attention_tensor
    remaining_ids = id_tensor
    while True:
        attention_slice = remaining_attention[:, :tokenizer.model_max_length]
        id_slice = remaining_ids[:, :tokenizer.model_max_length]
        features = model(id_slice, attention_mask=attention_slice, output_hidden_states=True)
        features = cloned_feature(features.hidden_states, num_layers, detach)
        if len(slices) > 0:
            # drop the part already covered by the previous window
            features = features[:, prefix_len:, :]
        slices.append(features)
        if remaining_ids.shape[1] <= tokenizer.model_max_length:
            break
        remaining_attention = remaining_attention[:, slice_len:]
        remaining_ids = remaining_ids[:, slice_len:]
    return torch.cat(slices, axis=1)


def convert_to_position_list(sentence, offsets):
    """
    Convert a transformers-tokenized sentence's offsets to a list of word to position
    """
    list_offsets = [None] * (len(sentence) + 2)
    for pos, offset in enumerate(offsets):
        if offset is None:
            continue
        # keeps the last wordpiece of each word by overwriting earlier positions
        list_offsets[offset + 1] = pos
    list_offsets[0] = 0
    for offset in list_offsets[::-1]:
        if offset is not None:
            list_offsets[-1] = offset + 1
            break
    return list_offsets


def extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach):
    tokenized = tokenizer(data, padding="longest", is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=True)

    list_offsets = []
    for idx in range(len(data)):
        offsets = convert_to_position_list(data[idx], tokenized.word_ids(batch_index=idx))
        list_offsets.append(offsets)

    if any(any(x is None for x in converted_offsets) for converted_offsets in list_offsets):
        # a None offset means a word was tokenized into nothing:
        # patch the data and retokenize
        data = fix_blank_tokens(tokenizer, data)
        tokenized = tokenizer(data, padding="longest", is_split_into_words=True, return_offsets_mapping=False, return_attention_mask=True)
        list_offsets = []
        for idx in range(len(data)):
            offsets = convert_to_position_list(data[idx], tokenized.word_ids(batch_index=idx))
            list_offsets.append(offsets)

    if any(any(x is None for x in converted_offsets) for converted_offsets in list_offsets):
        raise ValueError("OOPS, hit None when preparing to use Bert\ndata[idx]: {}\noffsets: {}\nlist_offsets[idx]: {}".format(data[idx], offsets, list_offsets[idx]))

    features = []
    for i in range(int(math.ceil(len(data) / 128))):
        attention_tensor = torch.tensor(tokenized['attention_mask'][128*i:128*i+128], device=device)
        id_tensor = torch.tensor(tokenized['input_ids'][128*i:128*i+128], device=device)
        if detach:
            with torch.no_grad():
                features += build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device)
        else:
            features += build_cloned_features(model, tokenizer, attention_tensor, id_tensor, num_layers, detach, device)

    processed = []
    if not keep_endpoints:
        list_offsets = [x[1:-1] for x in list_offsets]
    for feature, offsets in zip(features, list_offsets):
        feature = feature[offsets]
        processed.append(feature)

    return processed


def extract_bert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers=None, detach=True, peft_name=None):
    """
    Extract transformer embeddings using a generic roberta extraction

    data: list of list of string (the text tokens)
    num_layers: how many to return.  If None, the average of -2, -3, -4 is returned
    """
    if peft_name is None:
        if model._hf_peft_config_loaded:
            model.disable_adapters()
    else:
        model.enable_adapters()
        model.set_adapter(peft_name)

    if model_name.startswith("vinai/phobert"):
        return extract_phobert_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)
    if 'bart' in model_name:
        return extract_bart_word_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)

    if isinstance(data, tuple):
        data = list(data)

    if 'xlnet' in model_name:
        return extract_xlnet_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)

    return extract_base_embeddings(model_name, tokenizer, model, data, device, keep_endpoints, num_layers, detach)