o
    hk                     @   s   d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlm	  m
  mZ d dlmZmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlT edZG d	d
 d
ZdS )    N)Counter)
map_to_idsget_long_tensorget_float_tensorsort_all)
DeltaVocab)Vocab
MultiVocab)edit)*stanzac                   @   s   e Zd ZdddZdd Zdd Zd	d
 Zdd Zdd Zdd Z	e
dd Ze
dd Ze
dd Ze
dd Ze
dd ZdS )
DataLoaderNFc	                    s   | _ || _|| _| j | _|| _|  |rd S |d ur1tt|ks'J dd t|D |d urO|rK|d }	t|d }
t	|
|	d| _
n|| _
nt | _
| \}
}	t	|
|	d| _
|dddk r| jst|d t }t|td|d  | | j
d | j
d || jrttt}t| fd	d|D t| _ fd
dtdt D | _tdt d S )Nc                 S   s   g | ]\}}|s|qS  r   ).0xyr   r   S/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/lemma/data.py
<listcomp>    s    z'DataLoader.__init__.<locals>.<listcomp>poschar)r   r   sample_traing      ?z%Subsample training set with rate {:g}c                    s   g | ]} | qS r   r   r   i)datar   r   r   :       c                    s   g | ]
}||   qS r   r   r   
batch_sizer   r   r   r   >       r   z{} batches created.)r   argsevalshuffleddocraw_datalenzipr   r	   vocabdict
init_vocabgetintrandomsampleloggerdebugformat
preprocesslistrangeshufflenum_examplesr   )selfr!   r   r   r%   
evaluation
conll_onlyskipexpand_unk_vocab	pos_vocab
char_vocabkeepindicesr   r   r   __init__   sB   


 zDataLoader.__init__c                 C   s\   | j du s	J dddd |D }t|| jd }dd |D }t|| jd }||fS )	NFz$Vocab file must exist for evaluation c                 s   s     | ]}|d  |d  V  qdS )r      Nr   r   dr   r   r   	<genexpr>D   s    z(DataLoader.init_vocab.<locals>.<genexpr>langc                 S   s   g | ]}|d  qS )   r   r@   r   r   r   r   F   r   z)DataLoader.init_vocab.<locals>.<listcomp>)r   joinr   r   )r4   r   	char_datar:   pos_datar9   r   r   r   r'   B   s   zDataLoader.init_vocabc              	   C   s   g }|D ]R}t jt |d |d  }t|d }tjg| tjg }||}|d }	||	}	t|d }
|tjg|
 }||
tjg }|||||	||d gg7 }q|S )Nr   r?   rD   )	r
   
EDIT_TO_IDget_edit_typer0   constantSOSEOSmapunit2id)r4   r   r:   r9   r   	processedrA   	edit_typesrcr   tgttgt_intgt_outr   r   r   r/   J   s   

zDataLoader.preprocessc                 C   s
   t | jS N)r#   r   r4   r   r   r   __len__Y   s   
zDataLoader.__len__c                 C   s  t |tst|dk s|t| jkrt| j| }t|}tt| }t|dks+J dd |d D }t||\}}|d }t	||}t
|tj}t	|d |}t	|d |}	t
|d }
t
|d }|d	 }|d|	dksyJ d
||||	|
|||fS )z Get a batch with index. r      c                 S   s   g | ]}t |qS r   )r#   r   r   r   r   r   r   h   r   z*DataLoader.__getitem__.<locals>.<listcomp>rD   r?            z4Target input and output sequence sizes do not match.)
isinstancer)   	TypeErrorr#   r   
IndexErrorr0   r$   r   r   torcheqrJ   PAD_ID
LongTensorsize)r4   keybatchr   lensorig_idxrQ   src_maskrS   rT   r   editstextr   r   r   __getitem__\   s(   


zDataLoader.__getitem__c                 c   s$    t |  D ]}| |V  qd S rU   )r1   rW   rl   )r4   r   r   r   r   __iter__w   s   zDataLoader.__iter__c                 C   s   |  | j| jdd| jS )NcaselessF)load_docr!   r   r(   r   rV   r   r   r   r"   {   s   zDataLoader.raw_datac                 C   s`   |r|  tttg}n| j ttttttgdd}t|}t	|}t
|}|r.t|}|S )NT)as_sentences)r(   TEXTUPOSLEMMAHEADDEPRELMISCr   remove_goeswithextract_correct_formsresolve_nonelowercase_data)r!   rn   r5   r   r   r   r   ro   ~   s   



zDataLoader.load_docc           	      C   s   g }g }| D ]H}|d }|s| |dd  q|d}|D ]%}|drD|jdddd }| ||d |d	 f | ||f  n
q| |dd  qd
d |D }|D ]\}}|d |vrk| |dd  qX|S )a  
        Here we go through the raw data and use the CorrectForm of words tagged with CorrectForm

        In addition, if the incorrect form of the word is not present in the training data,
        we keep the incorrect form for the lemmatizer to learn from.
        This way, it can occasionally get things right in misspelled input text.

        We do check for and eliminate words where the incorrect form is already known as the
        lemma for a different word.  For example, in the English datasets, there is a "busy"
        which was meant to be "buys", and we don't want the model to learn to lemmatize "busy" to "buy"
        NrZ   |zCorrectForm==rD   )maxsplitr?   c                 S   s   h | ]}|d  qS )r   r   rY   r   r   r   	<setcomp>   r   z3DataLoader.extract_correct_forms.<locals>.<setcomp>r   )appendsplit
startswith)	r   new_dataincorrect_formswordmiscpiececfknown_wordscorrect_formr   r   r   rx      s.   

	z DataLoader.extract_correct_formsc                    sv   g }t   | D ]1}   t|D ]\}}|d dkr) |  |d d  q| fddt|D  q|S )aH  
        This method specifically removes words that goeswith something else, along with the something else

        The purpose is to eliminate text such as

1	Ken	kenrice@enroncommunications	X	GW	Typo=Yes	0	root	0:root	_
2	Rice@ENRON	_	X	GW	_	1	goeswith	1:goeswith	_
3	COMMUNICATIONS	_	X	ADD	_	1	goeswith	1:goeswith	_
        r[   goeswithrZ   rD   c                    s   g | ]
\}}| vr|qS r   r   )r   idxr   remove_indicesr   r   r      r   z.DataLoader.remove_goeswith.<locals>.<listcomp>)setclear	enumerateaddextend)r   filtered_datasentenceword_idxr   r   r   r   rw      s   
zDataLoader.remove_goeswithc                 C   s   | D ]
}|d   |d< q| S )Nr   )lower)r   tokenr   r   r   rz      s   zDataLoader.lowercase_datac                 C   sH   t t| D ]}t t| | D ]}| | | d u r d| | |< qq| S )N_)r1   r#   )r   tok_idxfeat_idxr   r   r   ry      s   zDataLoader.resolve_none)NFFNF)__name__
__module____qualname__r=   r'   r/   rW   rl   rm   r"   staticmethodro   rx   rw   rz   ry   r   r   r   r   r      s$    
0

%

r   )r*   numpynposcollectionsr   loggingr`   %stanza.models.common.seq2seq_constantmodelscommonseq2seq_constantrJ   stanza.models.common.datar   r   r   r   stanza.models.common.vocabr   stanza.models.lemma.vocabr   r	   stanza.models.lemmar
   stanza.models.common.doc	getLoggerr,   r   r   r   r   r   <module>   s    
