o
    h                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlm  m  mZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ ed	Zed
dZ eddZ!dZ"G dd dZG dd deZ#dS )    N)Counter
namedtuple)pad_sequence)
DataLoader)
map_to_idsget_long_tensorget_float_tensorsort_all)
DeltaVocab)Vocab)Documentstanza
DataSamplezsrc tgt_in tgt_out orig_text	DataBatchz.src src_mask tgt_in tgt_out orig_text orig_idx)"'u   ʼu   ˊu   ՚u   ߴu   ’u   ＇c                   @   sl   e Zd ZdddZdd Zdd Zd	d
 Zdd Zdd Zdd Z	e
dd Zdd Zdd ZdddZdS )r   NFc           
         s:  |_ |_|dd_|_|_jjjd |d u rIjdks'J  _jdkrHt	fddt
D rHt
D ]}j| q?n|rRt |_n|_|dd	d	k rzjszt|d t  }t | td
|d  jsttt }	t|	  fdd|	D   _t _d S )Naugment_aposg        )
evaluationFr   c                 3   s    | ]}| j v V  qd S N)vocab.0xself Q/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/mwt/data.py	<genexpr>,   s    z&DataLoader.__init__.<locals>.<genexpr>sample_traing      ?z%Subsample training set with rate {:g}c                    s   g | ]} | qS r   r   )r   i)datar   r   
<listcomp>>       z'DataLoader.__init__.<locals>.<listcomp>)
batch_sizeargsgetr   r   docload_doc
init_vocabr   anyAPOSadd_unitr
   intlenrandomsampleloggerdebugformatlistrangeshuffler    num_examples)
r   r&   r#   r$   r   r   expand_unk_vocabaposkeepindicesr   )r    r   r   __init__   s4    
zDataLoader.__init__c                 C   s"   | j dksJ t|| jd }|S )NF	shorthand)r   r   r$   )r   r    r   r   r   r   r(   C   s   zDataLoader.init_vocabc                 C   s\   t D ])}||d v r+tdd| jk r(tt }|d |||d ||f} |S q|S Nr      )r*   r.   uniformr   choicereplace)r   datumoriginalreplacementr   r   r   maybe_augment_aposH   s   
 zDataLoader.maybe_augment_aposc                 C   sl   | j s| jdkr| |}t|d }tjg| tjg }| | j|\}}| j	|}||||d g}|S )Nr   )
r   r   rE   r3   constantSOSEOSprepare_targetr   map)r   r/   srctgt_intgt_out	processedr   r   r   processQ   s   
zDataLoader.processc                 C   sL   | j r
t|d }nt|d }|tjg| }||tjg }||fS r=   )r   r3   rJ   rF   rG   rH   )r   r   rB   tgtrL   rM   r   r   r   rI   [   s   zDataLoader.prepare_targetc                 C   s
   t | jS r   )r-   r    r   r   r   r   __len__d   s   
zDataLoader.__len__c                 C   s   t |tst|dk s|t| jkrt| j| }| |}t|dks&J t|d }t|d }t|d }|d }t	|||||f}|S )z Get a batch with index. r      r>         )

isinstancer,   	TypeErrorr-   r    
IndexErrorrO   torchtensorr   )r   keyr/   rK   rL   rM   	orig_textresultr   r   r   __getitem__g   s   


zDataLoader.__getitem__c           
      C   s   t |  \} }t |  \}}}}t| }dd |D }t||||f|\\}}}}}dd |D }t|dtj}t|tj}	t|dtj}t|dtj}|d|dksYJ dt	||	||||S )Nc                 S      g | ]}t |qS r   r-   r   r   r   r   r!      r"   z+DataLoader.__collate_fn.<locals>.<listcomp>c                 S   r^   r   r_   r   r   r   r   r!      r"   Tr>   z4Target input and output sequence sizes do not match.)
zipr-   r	   r   rF   PAD_IDrX   eqsizer   )
r    idxrK   rL   rM   r[   r#   lensorig_idxsrc_maskr   r   r   __collate_fnx   s   zDataLoader.__collate_fnc                 c   s$    t |  D ]}| |V  qd S r   )r4   rQ   r]   )r   r   r   r   r   __iter__   s   zDataLoader.__iter__c                 C   s    | j }| j }t| | j||dS )zConverts self to a DataLoader )
collate_fnr#   r5   )r#   r   DL_DataLoader__collate_fn)r   r#   r5   r   r   r   	to_loader   s   zDataLoader.to_loaderc                 C   s    | |}|rdd |D }|S )Nc                 S   s   g | ]}|gqS r   r   )r   er   r   r   r!      s    z'DataLoader.load_doc.<locals>.<listcomp>)get_mwt_expansions)r   r&   r   r    r   r   r   r'      s   
zDataLoader.load_doc)NFF)F)__name__
__module____qualname__r;   r(   rE   rO   rI   rQ   r]   staticmethodrl   ri   rm   r'   r   r   r   r   r      s    
$	
	

r   c                   @   s   e Zd ZdZdd ZdS )BinaryDataLoaderz
    This version of the DataLoader performs the same tasks as the regular DataLoader,
    except the targets are arrays of 0/1 indicating if the character is the location
    of an MWT split
    c                 C   sh   | j r|d n|d }dg}d}|D ]}|dkrd}q|r%d}|d q|d q|d ||fS )Nr   r>   F T)r   append)r   r   rB   rK   binary	has_spacecharr   r   r   rI      s   
zBinaryDataLoader.prepare_targetN)rp   rq   rr   __doc__rI   r   r   r   r   rt      s    rt   )$r.   numpynposcollectionsr   r   loggingrX   torch.nn.utils.rnnr   torch.utils.datar   rk   %stanza.models.common.seq2seq_constantmodelscommonseq2seq_constantrF   stanza.models.common.datar   r   r   r	   stanza.models.common.vocabr
   stanza.models.mwt.vocabr   stanza.models.common.docr   	getLoggerr0   r   r   r*   rt   r   r   r   r   <module>   s(    


 