o
    h&                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZmZmZ d dlmZ d dlT d dlmZmZ ed	ZG d
d dZdS )    N)filter_dataneeds_length_filter)
map_to_idsget_long_tensorsort_all)PAD_IDVOCAB_PREFIX)	CharVocabCompositeVocab	WordVocab)
MultiVocab)*)process_tagsnormalize_empty_tagsstanzac                   @   s^   e Zd ZdddZdd Zdd	 Zd
d Zdd Zdd Zdd Z	dd Z
dd Zdd ZdS )
DataLoaderNFTc                 C   s6  |
| _ || _|| _|| _| j | _|| _|| _| | j|	}| jdd r5t	| jd r5t
| jd ||}dd |D | _|| _|d u rK| || _n|| _|dddk rs| jsst|d t| }t||}td|d  | || j|}| jrt| t|| _| || _tdt| j d S )N
bert_modelc                 S      g | ]	}d d |D qS )c                 S      g | ]}|d  qS     .0wr   r   Q/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/ner/data.py
<listcomp>       z2DataLoader.__init__.<locals>.<listcomp>.<listcomp>r   r   sentr   r   r   r          z'DataLoader.__init__.<locals>.<listcomp>sample_traing      ?z%Subsample training set with rate {:g}z{} batches created.)max_batch_words
batch_sizeargsevalshuffleddocpreprocess_tags	_load_docgetr   r   tagspretrain
init_vocabvocabintlenrandomsampleloggerdebugformat
preprocessshufflenum_exampleschunk_batchesdata)selfr'   r#   r$   r,   r.   
evaluationr(   bert_tokenizerschemer"   r:   keepr   r   r   __init__   s2   


zDataLoader.__init__c           	         s  dd }| j rtd| jd rt|| jd }nt|| jd }| jj dd |D }t|| jd d	d d
}d }| jd rd| jd rQt fdd|D }nt fdd|D }t	
dt| t|| jd d| jd |d}t	
dt| t| ||d}|S )Nc                 S   sP   t j| dd dd}d|v r|d S d|v r"d|d v r"|d d S td|  )z+ Try loading vocab from charLM model file. c                 S      | S Nr   )storagelocr   r   r   <lambda>:       z;DataLoader.init_vocab.<locals>.from_model.<locals>.<lambda>T)weights_onlyr.   modelz)Cannot find vocab in charLM model file %s)torchload
ValueError)model_filename
state_dictr   r   r   
from_model8   s   z)DataLoader.init_vocab.<locals>.from_modelz Vocab must exist for evaluation.charlmcharlm_forward_file	shorthandc                 S   r   )c                 S   s   g | ]}|d  fqS r   r   r   xr   r   r   r   H   s    z4DataLoader.init_vocab.<locals>.<listcomp>.<listcomp>r   r   sentencer   r   r   r   H   r    z)DataLoader.init_vocab.<locals>.<listcomp>r   )idxsepemb_finetune_known_only	lowercasec                    s:   g | ]}|D ]}|d   v s|d     v r|d  qqS r   lowerr   r   r   	wordvocabr   r   r   M   s   : c                    s*   g | ]}|D ]}|d   v r|d  qqS rZ   r   r]   r^   r   r   r   O   s   * zOIgnoring %d in the delta vocab as they did not appear in the original embeddingr   )cutoffr\   ignorezCreating delta vocab of size %s)charworddeltatag)r%   AssertionErrorr$   r	   load_state_dictr,   r.   r
   setr3   r4   r0   r   r   )	r;   r:   rN   	charvocabtag_datatagvocabra   
deltavocabr.   r   r^   r   r-   7   s.   	


zDataLoader.init_vocabc                    s   g }| ddrdd  ndd  t|D ]-\}}dd |D g}| fdd|D g7 }|d	 d
d |D g7 }|| q|S )Nchar_lowercaseFc                 S   s   |   S rB   r[   rS   r   r   r   rE   \   s    z'DataLoader.preprocess.<locals>.<lambda>c                 S   rA   rB   r   rn   r   r   r   rE   ^   rF   c                 S   r   rZ   r   r   r   r   r   r   `   r   z)DataLoader.preprocess.<locals>.<listcomp>c                    s,   g | ]}d    fdd|d D qS )rb   c                    s   g | ]} |qS r   r   rR   )	char_caser   r   r   a   r   z4DataLoader.preprocess.<locals>.<listcomp>.<listcomp>r   )mapr   ro   r.   r   r   r   a   s   , re   c                 S   r   r   r   r   r   r   r   r   b   r   )r*   	enumeraterp   append)r;   r:   r.   r$   	processedsent_idxr   processed_sentr   rq   r   r6   Y   s   
zDataLoader.preprocessc                 C   s
   t | jS rB   )r0   r:   )r;   r   r   r   __len__f   s   
zDataLoader.__len__c                 C   s  t |tst|dk s|t| jkrt| j| }t|}tt| }t|dks+J dd |d D }t||\}}dd |d D }| 	|d \}}}}	}
t||||	g|
\}}|\}}}}	dd |D }
dd |d D }d	d |D }t|g|\}}|d }d
d |D }|d }t
|t|}t|t}t
||| jd dd}t
||| jd dd}t|d|dg}||	g}t
|d |}|||||||||||
|fS )z Get a batch with index. r      c                 S      g | ]}t |qS r   r0   rR   r   r   r   r   u   r   z*DataLoader.__getitem__.<locals>.<listcomp>c                 S   ry   r   rz   rR   r   r   r   r   w   r   r   c                 S   ry   r   rz   r   r   r   r   r   }   r   c                 S      g | ]	}|D ]}|qqS r   r   r]   r   r   r   r      r    c                 S   ry   r   rz   rR   r   r   r   r      r   c                 S   ry   r   rz   rR   r   r   r   r      r   rb    )pad_id   )
isinstancer/   	TypeErrorr0   r:   
IndexErrorlistzipr   process_charsr   rI   eqr   r.   unit2idcat	unsqueeze)r;   keybatchr#   sentlensorig_idxchars_forwardchars_backwardcharoffsets_forwardcharoffsets_backwardcharlenschars_sortedchar_orig_idxbatch_wordswordlensword_orig_idxwords	wordcharswordchars_maskcharscharoffsetsr+   r   r   r   __getitem__i   s:   

zDataLoader.__getitem__c                 c   s$    t |  D ]}| |V  qd S rB   )rangerw   r   )r;   ir   r   r   __iter__   s   zDataLoader.__iter__c                 C   sR   |j tttgddd}dd |D }| jr'|d u r#t|| j dd}t|}|S )NT)as_sentences
from_tokenc                 S   r   )c                 S   s6   g | ]}|d  r|d |d  gn|d |d fgqS )r~   r   r   r   )r   tokenr   r   r   r      s   6 z3DataLoader._load_doc.<locals>.<listcomp>.<listcomp>r   rT   r   r   r   r      r    z(DataLoader._load_doc.<locals>.<listcomp>r>   bio)r*   TEXTNER	MULTI_NERr(   r   r$   r   )r;   r'   r>   r:   r   r   r   r)      s   zDataLoader._load_docc                 C   s  | j d d| j d d}}d\}}g g g g f\}}}}	|D ]V}
|g|gg g f\}}}}|
D ]}||7 }|t|g }||g7 }q1|
d d d D ]}||d d d 7 }t|g| }||g7 }qK|| || || |	| q!dd |D }||||	|fS )Nrb   
r|   )r   r   c                 S   ry   r   rz   r   r   r   r   r      r   z,DataLoader.process_chars.<locals>.<listcomp>)r.   r   r0   rs   )r;   sentsstart_idend_idstart_offset
end_offsetr   r   r   r   r   chars_forward_sentchars_backward_sentcharoffsets_forward_sentcharoffsets_backward_sentrc   r   r   r   r   r      s&   "


zDataLoader.process_charsc                 C   s*   dd | j D }t| | || _ d S )Nc                 S   r{   r   r   )r   rS   yr   r   r   r      r    z(DataLoader.reshuffle.<locals>.<listcomp>)r:   r1   r7   r9   )r;   r:   r   r   r   	reshuffle   s   
zDataLoader.reshufflec                    s   j d u r fddtdt jD S g }g } D ](}|| t|jkr1|| g }tdd |D j krD|| g }qt|dkrP|| |S )Nc                    s   g | ]} ||j   qS r   )r#   )r   r   r:   r;   r   r   r      s    z,DataLoader.chunk_batches.<locals>.<listcomp>r   c                 s   s    | ]	}t |d  V  qdS )r   Nrz   rR   r   r   r   	<genexpr>   s    z+DataLoader.chunk_batches.<locals>.<genexpr>)r"   r   r0   r#   rs   sum)r;   r:   batches
next_batchitemr   r   r   r9      s    
"



zDataLoader.chunk_batches)NNFTNNN)__name__
__module____qualname__r@   r-   r6   rw   r   r   r)   r   r   r9   r   r   r   r   r      s    
'")r   )r1   loggingrI   #stanza.models.common.bert_embeddingr   r   stanza.models.common.datar   r   r   stanza.models.common.vocabr   r   stanza.models.pos.vocabr	   r
   r   stanza.models.ner.vocabr   stanza.models.common.docstanza.models.ner.utilsr   r   	getLoggerr3   r   r   r   r   r   <module>   s    
