import random
import logging
import copy

import torch

from collections import namedtuple
from torch.utils.data import DataLoader as DL
from torch.utils.data.sampler import Sampler
from torch.nn.utils.rnn import pad_sequence

from stanza.models.common.bert_embedding import filter_data, needs_length_filter
from stanza.models.common.data import map_to_ids, get_long_tensor, get_float_tensor, sort_all
from stanza.models.common.vocab import PAD_ID, VOCAB_PREFIX, CharVocab
from stanza.models.pos.vocab import WordVocab, XPOSVocab, FeatureVocab, MultiVocab
from stanza.models.pos.xpos_vocab_factory import xpos_vocab_factory
from stanza.models.common.doc import *

logger = logging.getLogger('stanza')

DataSample = namedtuple("DataSample", "word char upos xpos feats pretrain text")
DataBatch = namedtuple("DataBatch", "words words_mask wordchars wordchars_mask upos xpos ufeats pretrained orig_idx word_orig_idx lens word_lens text idx")

class Dataset:
    def __init__(self, doc, args, pretrain, vocab=None, evaluation=False, sort_during_eval=False, bert_tokenizer=None, **kwargs):
        self.args = args
        self.eval = evaluation
        self.shuffled = not self.eval
        self.sort_during_eval = sort_during_eval
        self.doc = doc

        if vocab is None:
            self.vocab = Dataset.init_vocab([doc], args)
        else:
            self.vocab = vocab

        # a column counts as present if at least one value in it is filled in
        self.has_upos = not all(x is None or x == '_' for x in doc.get(UPOS, as_sentences=False))
        self.has_xpos = not all(x is None or x == '_' for x in doc.get(XPOS, as_sentences=False))
        self.has_feats = not all(x is None or x == '_' for x in doc.get(FEATS, as_sentences=False))

        data = self.load_doc(self.doc)
        # some transformers need overly long sentences filtered out
        if self.args.get('bert_model', None) and needs_length_filter(self.args['bert_model']):
            data = filter_data(self.args['bert_model'], data, bert_tokenizer)

        # handle pretrain; the pretrain vocab is only used when pretrained embeddings are active
        self.pretrain_vocab = None
        if pretrain is not None and args['pretrain']:
            self.pretrain_vocab = pretrain.vocab

        # subsample the training data if requested
        if args.get('sample_train', 1.0) < 1.0 and not self.eval:
            keep = int(args['sample_train'] * len(data))
            data = random.sample(data, keep)
            logger.debug("Subsample training set with rate {:g}".format(args['sample_train']))

        data = self.preprocess(data, self.vocab, self.pretrain_vocab, args)
        self.data = data
        self.num_examples = len(data)

        # which tag ids count as punctuation for the augmentation in __mask
        self.__punct_tags = self.vocab['upos'].map(["PUNCT"])
        self.augment_nopunct = self.args.get('augment_nopunct', 0.0)
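    # Hedged usage sketch (not from the original source): `train_doc`,
    # `dev_doc`, `loaded_args`, and `loaded_pretrain` are hypothetical
    # stand-ins for a stanza Document, the tagger args dict, and a loaded
    # pretrained embedding object.
    #
    #   train_data = Dataset(train_doc, loaded_args, loaded_pretrain)
    #   dev_data = Dataset(dev_doc, loaded_args, loaded_pretrain,
    #                      vocab=train_data.vocab, evaluation=True)
    #
    # Passing the training vocab to the dev set keeps the id mappings
    # consistent between training and evaluation.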
   
    @staticmethod
    def init_vocab(docs, args):
        data = [x for doc in docs for x in Dataset.load_doc(doc)]
        charvocab = CharVocab(data, args['shorthand'])
        wordvocab = WordVocab(data, args['shorthand'], cutoff=args['word_cutoff'], lower=True)
        uposvocab = WordVocab(data, args['shorthand'], idx=1)
        xposvocab = xpos_vocab_factory(data, args['shorthand'])
        try:
            featsvocab = FeatureVocab(data, args['shorthand'], idx=3)
        except ValueError as e:
            raise ValueError("Unable to build features vocab.  Please check the Features column of your data for an error which may match the following description.") from e
        vocab = MultiVocab({'char': charvocab,
                            'word': wordvocab,
                            'upos': uposvocab,
                            'xpos': xposvocab,
                            'feats': featsvocab})
        return vocab
    def preprocess(self, data, vocab, pretrain_vocab, args):
        processed = []
        for sent in data:
            processed_sent = DataSample(
                word=[vocab['word'].map([w[0] for w in sent])],
                char=[[vocab['char'].map([x for x in w[0]]) for w in sent]],
                upos=[vocab['upos'].map([w[1] for w in sent])],
                xpos=[vocab['xpos'].map([w[2] for w in sent])],
                feats=[vocab['feats'].map([w[3] for w in sent])],
                pretrain=[pretrain_vocab.map([w[0].lower() for w in sent])] if pretrain_vocab is not None else [[PAD_ID] * len(sent)],
                text=[w[0] for w in sent])
            processed.append(processed_sent)
        return processed

    def __len__(self):
        return len(self.data)

    def __mask(self, upos):
        """Returns a torch boolean about which elements should be masked out"""
        mask = torch.zeros_like(upos, dtype=torch.bool)
        if random.uniform(0, 1) < self.augment_nopunct:
            for i in self.__punct_tags:
                # only the final token of a sentence is a candidate for chopping
                last_element = torch.zeros_like(upos, dtype=torch.bool)
                last_element[..., -1] = True
                # don't chop a sentence which consists of nothing but this punct tag
                if not torch.equal(upos, torch.tensor([i])):
                    mask |= ((upos == i) & last_element)
        return mask

    def __getitem__(self, key):
        """Retrieves a sample from the dataset.

        Retrieves a sample from the dataset. This function, for the
        most part, is spent performing ad-hoc data augmentation and
        restoration. It receives a DataSample object from the storage,
        and returns an almost-identical DataSample object that may
        have been augmented, possibly (depending on the augment_nopunct
        setting) with its trailing PUNCT chopped.

        **Important Note**
        ------------------
        If you would like to load the data into a model, please convert
        this Dataset object into a DataLoader via self.to_loader(). Then,
        you can use the resulting object like any other PyTorch data
        loader. As masks are calculated ad hoc for each batch, the samples
        returned from this object don't have the appropriate masking.

        Motivation
        ----------
        Why is this here? Every time you call next(iter(dataloader)), it calls
        this function. Because each sample is augmented on each access, the
        model sees dynamically generated augmentation on every iteration.
        Furthermore, the PyTorch dataloader handles shuffling natively.

        Parameters
        ----------
        key : int
            the integer index of the sample to retrieve.

        Returns
        -------
        DataSample
            The sample of data you requested, with augmentation.
        """
        sample = self.data[key]

        words = torch.tensor(sample.word[0])
        upos = torch.tensor(sample.upos[0]) if self.has_upos else None
        xpos = torch.tensor(sample.xpos[0]) if self.has_xpos else None
        ufeats = torch.tensor(sample.feats[0]) if self.has_feats else None
        pretrained = torch.tensor(sample.pretrain[0])
        wordchars = sample.char[0]
        raw_text = sample.text

        # augmentation only applies at training time, when the data is shuffled
        if self.shuffled and upos is not None:
            mask = self.__mask(upos)
        else:
            mask = None

        if mask is not None:
            for mask_index in mask.nonzero():
                mask_index = mask_index.item()
                # blank out the chopped token in the tensors; downstream
                # length calculations treat PAD_ID entries as absent
                words[mask_index] = PAD_ID
                if upos is not None:
                    upos[mask_index] = PAD_ID
                if xpos is not None:
                    xpos[mask_index, ...] = PAD_ID
                if ufeats is not None:
                    ufeats[mask_index, ...] = PAD_ID
                pretrained[mask_index] = PAD_ID
                # the list-valued fields shrink instead
                wordchars = wordchars[:mask_index] + wordchars[mask_index + 1:]
                raw_text = raw_text[:mask_index] + raw_text[mask_index + 1:]

        return DataSample(words, wordchars, upos, xpos, ufeats, pretrained, raw_text), key

    def __iter__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)

    def to_loader(self, **kwargs):
        """ Converts self to a DataLoader """
        return DL(self, collate_fn=Dataset.__collate_fn, **kwargs)
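    # Hedged sketch of the intended call pattern: keyword arguments are
    # forwarded straight to torch.utils.data.DataLoader, so batching and
    # shuffling are configured there. `train_data` is a hypothetical Dataset.
    #
    #   loader = train_data.to_loader(batch_size=32, shuffle=True)
    #   for batch in loader:
    #       # each batch is a DataBatch namedtuple
    #       words, words_mask = batch.words, batch.words_mask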
    def to_length_limited_loader(self, batch_size, maximum_tokens):
        # batches are precomputed by the sampler so that no batch exceeds
        # maximum_tokens total words (aside from single overlong sentences)
        sampler = LengthLimitedBatchSampler(self, batch_size, maximum_tokens)
        return DL(self, collate_fn=Dataset.__collate_fn, batch_sampler=sampler)
    @staticmethod
    def __collate_fn(data):
        """Function used by DataLoader to pack data"""
        data, idx = zip(*data)
        words, wordchars, upos, xpos, ufeats, pretrained, text = zip(*data)

        # sort sentences by lens for easy RNN operations
        lens = [torch.sum(x != PAD_ID) for x in words]
        (words, wordchars, upos, xpos, ufeats, pretrained, text), orig_idx = sort_all((words, wordchars, upos, xpos, ufeats, pretrained, text), lens)
        lens = [torch.sum(x != PAD_ID) for x in words]

        # sort all words by lens for easy char-RNN operations
        wordchars = [w for sent in wordchars for w in sent]
        word_lens = [len(x) for x in wordchars]
        (wordchars,), word_orig_idx = sort_all((wordchars,), word_lens)
        word_lens = [len(x) for x in wordchars]

        # pad everything into rectangular tensors; columns which are
        # entirely absent stay None
        words = pad_sequence(words, True, PAD_ID)
        if None not in upos:
            upos = pad_sequence(upos, True, PAD_ID)
        else:
            upos = None
        if None not in xpos:
            xpos = pad_sequence(xpos, True, PAD_ID)
        else:
            xpos = None
        if None not in ufeats:
            ufeats = pad_sequence(ufeats, True, PAD_ID)
        else:
            ufeats = None
        pretrained = pad_sequence(pretrained, True, PAD_ID)
        wordchars = get_long_tensor(wordchars, len(word_lens))
        words_mask = torch.eq(words, PAD_ID)
        wordchars_mask = torch.eq(wordchars, PAD_ID)

        return DataBatch(words, words_mask, wordchars, wordchars_mask, upos, xpos, ufeats,
                         pretrained, orig_idx, word_orig_idx, lens, word_lens, text, idx)

    @staticmethod
    def load_doc(doc):
        data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
        data = Dataset.resolve_none(data)
        return data

    @staticmethod
    def resolve_none(data):
        # replace None with '_'
        for sent_idx in range(len(data)):
            for tok_idx in range(len(data[sent_idx])):
                for feat_idx in range(len(data[sent_idx][tok_idx])):
                    if data[sent_idx][tok_idx][feat_idx] is None:
                        data[sent_idx][tok_idx][feat_idx] = '_'
        return data
class LengthLimitedBatchSampler(Sampler):
    """
    Batches up the text in batches of batch_size, but cuts off each time a batch reaches maximum_tokens

    Intent is to avoid GPU OOM in situations where one sentence is significantly longer than expected,
    leaving a batch too large to fit in the GPU

    Sentences which are longer than maximum_tokens by themselves are put in their own batches
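
    As a rough worked example (numbers invented for illustration): with
    batch_size=3 and maximum_tokens=10, sentences of lengths [4, 5, 12, 3]
    are grouped as the batches [0, 1], [2], [3]; the 12-token sentence
    exceeds maximum_tokens by itself, so it is isolated in its own batch.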
    """

    def __init__(self, data, batch_size, maximum_tokens):
        """
        Precalculate the batches, making it so len and iter just read off the precalculated batches
        """
        self.data = data
        self.batch_size = batch_size
        self.maximum_tokens = maximum_tokens

        self.batches = []
        current_batch = []
        current_length = 0
        for item, item_idx in data:
            item_len = len(item.word)
            # overlong sentences always go in a batch by themselves
            if maximum_tokens and item_len > maximum_tokens:
                if len(current_batch) > 0:
                    self.batches.append(current_batch)
                    current_batch = []
                    current_length = 0
                self.batches.append([item_idx])
                continue
            # start a new batch if this item would overfill the current one
            if len(current_batch) + 1 > batch_size or (maximum_tokens and current_length + item_len > maximum_tokens):
                self.batches.append(current_batch)
                current_batch = []
                current_length = 0
            current_batch.append(item_idx)
            current_length += item_len
        if len(current_batch) > 0:
            self.batches.append(current_batch)

    def __len__(self):
        return len(self.batches)

    def __iter__(self):
        for batch in self.batches:
            # yield a copy so callers can't mutate the precalculated batches
            indices = []
            for idx in batch:
                indices.append(idx)
            yield indices
class ShuffledDataset:
    """A wrapper around one or more datasets which shuffles the data in batch_size chunks

    This means that if multiple datasets are passed in, the batches
    from each dataset are shuffled together, with one batch being
    entirely members of the same dataset.

    The main use case of this is that in the tagger, there are cases
    where batches from different datasets will have different
    properties, such as having or not having UPOS tags.  We found that
    it is actually somewhat tricky to make the model's loss function
    (in model.py) properly represent batches which mix sentences with
    and without a property, whereas keeping each batch homogeneous
    makes it a lot easier to process.

    The mechanism for the shuffling is that the iterator first makes a
    list long enough to represent each batch from each dataset,
    tracking the index of the dataset it is coming from, then shuffles
    that list.  Another alternative would be to use a weighted
    randomization approach, but this is very simple and the memory
    requirements are not too onerous.

    Note that the batch indices are wasteful in the case of only one
    underlying dataset, which is actually the most common use case,
    but the overhead is small enough that it probably isn't worth
    special casing the one dataset version.
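
    As a small illustration of that mechanism (invented numbers): two
    datasets with 3 and 2 batches respectively produce the index list
    [0, 0, 0, 1, 1]; after shuffling, each entry pulls the next batch
    from the corresponding dataset's loader.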
    """

    def __init__(self, datasets, batch_size):
        self.batch_size = batch_size
        self.datasets = datasets
        self.loaders = [x.to_loader(batch_size=self.batch_size, shuffle=True) for x in self.datasets]

    def __iter__(self):
        iterators = [iter(x) for x in self.loaders]
        lengths = [len(x) for x in self.loaders]
        # one entry per batch, tagged with which dataset it comes from
        indices = [[x] * y for x, y in enumerate(lengths)]
        indices = [idx for inner in indices for idx in inner]
        random.shuffle(indices)

        for idx in indices:
            yield next(iterators[idx])

    def __len__(self):
        return sum(len(x) for x in self.loaders)