o
    h                     @   s   d Z ddlZddlZddlZddlm  m  mZ ddl	m
Z
mZmZ edZdd ZejfddZd	d
 Zdd ZdddZdd Zdd ZeedfddZdS )z-
Utility functions for data transformations.
    N)HEADIDUPOSstanzac                    s    fdd| D }|S )Nc                    s"   g | ]}| v r | nt jqS  )constantUNK_ID.0tvocabr   T/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/common/data.py
<listcomp>   s   " zmap_to_ids.<locals>.<listcomp>r   )tokensr   idsr   r   r   
map_to_ids   s   r   c                 C   s   g }| }t |d tr%|tdd |D  dd |D }t |d tstj|g|R  |}t| D ]\}}t|||dt|f< q5|S )z4 Convert (list of )+ tokens to a padded LongTensor. r   c                 s       | ]}t |V  qd S Nlen)r
   yr   r   r   	<genexpr>       z"get_long_tensor.<locals>.<genexpr>c                 S   s   g | ]	}|D ]}|qqS r   r   )r
   r   zr   r   r   r      s    z#get_long_tensor.<locals>.<listcomp>N)	
isinstancelistappendmaxtorch
LongTensorfill_	enumerater   )tokens_list
batch_sizepad_idsizesxr   isr   r   r   get_long_tensor   s   r*   c                 C   s   | d u s
| d d u rd S t dd | D }t| d d }t||| }t| D ]\}}t|||d t|d d f< q*|S )Nr   c                 s   r   r   r   )r
   r'   r   r   r   r   $   r   z#get_float_tensor.<locals>.<genexpr>)r   r   r   FloatTensorzero_r"   )features_listr$   seq_lenfeature_lenfeaturesr(   fr   r   r   get_float_tensor!   s   "r2   c                 C   sb   | g gkr
g gg fS |gt t|g t|  }dd ttt| dd D }|dd |d fS )zO Sort all fields by descending order of lens, and return the original indices. c                 S   s   g | ]}t |qS r   )r   r	   r   r   r   r   0   s    zsort_all.<locals>.<listcomp>T)reverse   N   )ranger   r   zipsorted)batchlensunsorted_all
sorted_allr   r   r   sort_all+   s
   

r=   皙?      ?c                    s   t | }tfdd| D }t fdd| D }t fdd| D }|dkr.td|dkr<td  dS || ||  }	|	dk rJdS |	| }
|
|krT|S |
S )	a  
    Returns X so that if you randomly select X * N sentences, you get 10%

    The ratio will be chosen in the assumption that the final dataset
    is of size N rather than N + X * N.

    should_augment_predicate: returns True if the sentence has some
      feature which we may want to change occasionally.  for example,
      depparse sentences which end in punct
    can_augment_predicate: in the depparse sentences example, it is
      technically possible for the punct at the end to be the parent
      of some other word in the sentence.  in that case, the sentence
      should not be chosen.  should be at least as restrictive as
      should_augment_predicate
    c                 3       | ]} |V  qd S r   r   r
   sentence)should_augment_predicater   r   r   D   r   z$get_augment_ratio.<locals>.<genexpr>c                 3   r@   r   r   rA   )can_augment_predicater   r   r   E   r   c                 3   s"    | ]} |o| V  qd S r   r   rA   rD   rC   r   r   r   F   s    r   zOcan_augment_predicate allowed sentences not allowed by should_augment_predicatez9Found no sentences which matched can_augment_predicate {}g        )r   sumAssertionErrorloggerwarningformat)
train_datarC   rD   desired_ratio	max_ration_datan_should_augmentn_can_augmentn_errorn_neededratior   rE   r   get_augment_ratio3   s$   rT   c                 C   s   | d }| td dkS )NPUNCT)getr   )rB   	last_wordr   r   r    should_augment_nopunct_predicateX   s   rY   c                    sN   | d    tddkrdS t t dkrdS t fdd| D r%dS dS )	zo
    Check that the sentence ends with PUNCT and also doesn't have any words which depend on the last word
    rU   NrV   Fr5   c                 3   s4    | ]}t |t d ko|t  t d kV  qdS )r5   r   N)r   r   r   )r
   wordrX   r   r   r   f   s   2 z0can_augment_nopunct_predicate.<locals>.<genexpr>T)rW   r   r   r   any)rB   r   r[   r   can_augment_nopunct_predicate\   s   r]   Tc                 C   s   t | dkrg S |du rt| ||}|dkr|rt| S g S g }| D ]'}||rIt |k rBt |dkrBt|dd }|| q"|rI|| q"|S )a  
    Adds extra training data to compensate for some models having all sentences end with PUNCT

    Some of the models (for example, UD_Hebrew-HTB) have the flaw that
    all of the training sentences end with PUNCT.  The model therefore
    learns to finish every sentence with punctuation, even if it is
    given a sentence with non-punct at the end.

    One simple way to fix this is to train on some fraction of training data with punct.

    Params:
    train_data: list of list of dicts, eg a conll doc
    augment_ratio: the fraction to augment.  if None, a best guess is made to get to 10%

    should_augment_predicate: a function which returns T/F if a sentence already ends with not PUNCT
    can_augment_predicate: a function which returns T/F if it makes sense to remove the last PUNCT

    TODO: do this dynamically, as part of the DataLoader or elsewhere?
    One complication is the data comes back from the DataLoader as
    tensors & indices, so it is much more complicated to manipulate
    r   Nr5   rU   )r   rT   r   randomr   )rK   augment_ratiorC   rD   keep_original_sentencesnew_datarB   new_sentencer   r   r   augment_punctj   s$   
rc   )r>   r?   )__doc__loggingr^   r   %stanza.models.common.seq2seq_constantmodelscommonseq2seq_constantr   stanza.models.common.docr   r   r   	getLoggerrH   r   PAD_IDr*   r2   r=   rT   rY   r]   rc   r   r   r   r   <module>   s$    


%