o
    h;&                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
 d dlmZmZmZmZmZ d dlmZmZmZmZ d dlmZ d dlT edZd	d
 ZG dd dZdddZdS )    N)filter_dataneeds_length_filter)
map_to_idsget_long_tensorget_float_tensorsort_all)PAD_IDVOCAB_PREFIXROOT_IDCompositeVocab	CharVocab)	WordVocab	XPOSVocabFeatureVocab
MultiVocab)xpos_vocab_factory)*stanzac           
      C   s  g }|st | dd t dkd} d}n|r%t| gdd | D \\} }nd}g }d}| D ]F}	|durOt|	d |krO|dkrH|| g }d}||	g q-t|	d | |krf|dkrf|| g }d}||	 |t|	d 7 }q-|dkr}|| ||fS )	ai  
    Given a list of lists, where the first element of each sublist
    represents the sentence, group the sentences into batches.

    During training mode (not eval_mode) the sentences are sorted by
    length with a bit of random shuffling.  During eval mode, the
    sentences are sorted by length if sort_during_eval is true.

    Refactored from the data structure in case other models could use
    it and for ease of testing.

    Returns (batches, original_order), where original_order is None
    when in train mode or when unsorted and represents the original
    location of each sentence in the sort
    c                 S   s   t | d S Nr   len)x r   V/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/depparse/data.py<lambda>"   s    z!data_to_batches.<locals>.<lambda>g      ?)keyreverseNc                 S   s   g | ]}t |d  qS r   r   .0r   r   r   r   
<listcomp>%       z#data_to_batches.<locals>.<listcomp>r   )sortedrandomr   r   append)
data
batch_size	eval_modesort_during_evalmin_length_to_batch_separatelyresdata_orig_idxcurrent
currentlenr   r   r   r   data_to_batches   s2   



r.   c                   @   sf   e Zd ZdddZdd Zdd Zd	d
 Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd ZdS )
DataLoaderNFc
                 C   s>  || _ || _|| _|| _| j | _|| _|| _| |}
|d u r'| |
| _	n|| _	| j
dd rAt| jd rAt| jd |
|	}
d | _|d urP|d rP|j	| _|
dddk ru| jsut|d t|
 }t|
|}
td|d  | |
| j	| j|}
| jrt|
 t|
| _| |
| _tdt| j d S )N
bert_modelpretrainsample_traing      ?z%Subsample training set with rate {:g}z{} batches created.)r&   r)   argsevalshuffledr(   docload_doc
init_vocabvocabgetr   r   pretrain_vocabintr   r#   sampleloggerdebugformat
preprocessshufflenum_exampleschunk_batchesr%   )selfr6   r&   r3   r1   r9   
evaluationr(   r)   bert_tokenizerr%   keepr   r   r   __init__B   s4   



zDataLoader.__init__c           
   	   C   s   | j dksJ t|| jd }t|| jd ddd}t|| jd dd}t|| jd }t|| jd dd}t|| jd dd	dd
}t|| jd dd}t|||||||d}	|	S )NF	shorthand   T)cutofflower   )idx      )rL   rO   rM      )charworduposxposfeatslemmadeprel)r4   r   r3   r   r   r   r   )
rE   r%   	charvocab	wordvocab	uposvocab	xposvocab
featsvocab
lemmavocabdeprelvocabr9   r   r   r   r8   k   s"   zDataLoader.init_vocabc           
         s  g }t d trtgtd  gntg}tgtd  g}|D ]}tgd dd |D  g}	|	tggfdd|D  g7 }	|	tgd dd |D  g7 }	|	|d d	d |D  g7 }	|	|d d
d |D  g7 }	|d ur|	tg|dd |D  g7 }	n|	tgtgt|  g7 }	|	tgd dd |D  g7 }	|	 fdd|D g7 }	|	d dd |D g7 }	|	dd |D  ||	 q"|S )NrV   rW   rT   c                 S      g | ]}|d  qS r   r   r   wr   r   r   r           z)DataLoader.preprocess.<locals>.<listcomp>c                    s(   g | ]} d   dd |d D qS )rS   c                 S   s   g | ]}|qS r   r   r   r   r   r   r       s    z4DataLoader.preprocess.<locals>.<listcomp>.<listcomp>r   )maprb   )r9   r   r   r       s   ( rU   c                 S   ra   )rN   r   rb   r   r   r   r       rd   c                 S   ra   )   r   rb   r   r   r   r       rd   c                 S   ra   )rP   r   rb   r   r   r   r       rd   c                 S   s   g | ]}|d    qS r   )rM   rb   r   r   r   r       r!   rX   c                 S   ra   )rQ   r   rb   r   r   r   r       rd   c                    s   g | ]}t |d   jdqS )   )ignore_error)to_intr4   rb   rE   r   r   r       s    rY   c                 S   ra   )rR   r   rb   r   r   r   r       rd   c                 S   ra   r   r   rb   r   r   r   r       rd   )
isinstancer   r
   r   re   r   r$   )
rE   r%   r9   r;   r3   	processedxpos_replacementfeats_replacementsentprocessed_sentr   )rE   r9   r   rA   }   s$   (  $"""$zDataLoader.preprocessc                 C   s
   t | jS N)r   r%   rj   r   r   r   __len__      
zDataLoader.__len__c                 C   s  t |tst|dk s|t| jkrt| j| }t|}tt| }t|dks+J dd |d D }t||\}}dd |d D }dd |D }t|g|\}}|d }dd |D }|d }	t	|	|}	t
|	t}
t	|t|}t
|t}t	|d	 |}t	|d
 |}t	|d |}t	|d |}dd |d D }t	|d |}t	|d |}t	|d |}|d }|	|
||||||||||||||fS )z Get a batch with index. r   
   c                 S      g | ]}t |qS r   r   r   r   r   r   r       rd   z*DataLoader.__getitem__.<locals>.<listcomp>c                 S      g | ]	}|D ]}|qqS r   r   )r   ro   rc   r   r   r   r           rN   c                 S   ru   r   r   r   r   r   r   r       rd   c                 S   ru   r   r   r   r   r   r   r       rd   rf   rP   rQ   rg   c                 S   ru   r   r   r   r   r   r   r       rd   rR   rK      	   )rk   r<   	TypeErrorr   r%   
IndexErrorlistzipr   r   torcheqr   )rE   r   batchr&   lensorig_idxbatch_words	word_lensword_orig_idxwords
words_mask	wordcharswordchars_maskrU   rV   ufeats
pretrainedsentlensrX   headrY   textr   r   r   __getitem__   s<   


$zDataLoader.__getitem__c                 C   s*   |j tttttttgdd}| |}|S )NT)as_sentences)	r:   TEXTUPOSXPOSFEATSLEMMAHEADDEPRELresolve_none)rE   r6   r%   r   r   r   r7      s   
zDataLoader.load_docc                 C   sj   t t|D ],}t t|| D ]!}t t|| | D ]}|| | | d u r0d|| | |< qqq|S )N_)ranger   )rE   r%   sent_idxtok_idxfeat_idxr   r   r   r      s   zDataLoader.resolve_nonec                 c   s$    t |  D ]}| |V  qd S rq   )r   rr   r   )rE   ir   r   r   __iter__   s   zDataLoader.__iter__c                 C   s
   || _ d S rq   )r&   )rE   r&   r   r   r   set_batch_size   rs   zDataLoader.set_batch_sizec                 C   s,   dd | j D }| || _ t| j  d S )Nc                 S   rv   r   r   )r   r   yr   r   r   r       rw   z(DataLoader.reshuffle.<locals>.<listcomp>)r%   rD   r#   rB   )rE   r%   r   r   r   	reshuffle   s   zDataLoader.reshufflec                 C   s(   t || j| j| j| jd\}}|| _|S )N)r%   r&   r'   r(   r)   )r.   r&   r4   r(   r)   r+   )rE   r%   batchesr+   r   r   r   rD      s   
zDataLoader.chunk_batches)NFFNN)__name__
__module____qualname__rI   r8   rA   rr   r   r7   r   r   r   r   rD   r   r   r   r   r/   @   s    
)(	r/   Fc              
   C   s>   zt | }W |S  ty } z|rW Y d }~dS |d }~ww r   )r<   
ValueError)stringrh   r*   errr   r   r   ri      s   
ri   )F)r#   loggingr~   #stanza.models.common.bert_embeddingr   r   stanza.models.common.datar   r   r   r   stanza.models.common.vocabr   r	   r
   r   r   stanza.models.pos.vocabr   r   r   r   $stanza.models.pos.xpos_vocab_factoryr   stanza.models.common.doc	getLoggerr>   r.   r/   ri   r   r   r   r   <module>   s    
2 !