o
    hV                     @   s*   d dl Z d dlZd dlZG dd dZdS )    Nc                   @   sF   e Zd ZdZdddZ		dddZedddZdd Zdd Z	dS )
DataLoadera$  
    Class for loading language id data and providing batches

    Attempt to recreate data pre-processing from: https://github.com/AU-DIS/LSTM_langid

    Uses methods from: https://github.com/AU-DIS/LSTM_langid/blob/main/src/language_datasets.py

    Data format is same as LSTM_langid
    Nc                 C   s(   d | _ d | _d | _d | _d | _|| _d S N)batchesbatches_iter
tag_to_idx
idx_to_taglang_weightsdevice)selfr	    r   T/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/langid/data.py__init__   s   
zDataLoader.__init__F      c                    sj  g }|D ]}	|dd t |	 dD 7 }qt| dd |D }t|}tdd |D t|  }
|
D ]}t|||< q8|_	dd t
dd j	 D D _dd |D }d	urjfd
d|D }|rg }|D ]"}|d }|d tj|g|d |d d}|fdd|D 7 }qp|}t| i  |D ]8}|d }|d t| vrg  t|< fddt|D } t| || f ||   d7  < q D ]	t   qg } D ] fddtdt  D D ]}|| qqfdd|D _t|fdd|D }tj|jtjd_tj tj_d	S )z
        Load sequence data and labels, calculate weights for weighted cross entropy loss.
        Data is stored in a file, 1 example per line
        Example: {"text": "Hello world.", "label": "en"}
        c                 S   s   g | ]}|  r|qS r   )strip.0xr   r   r   
<listcomp>$       z(DataLoader.load_data.<locals>.<listcomp>
c                 S   s   g | ]}t |qS r   )jsonloadsr   r   r   r   r   &       c                 S      g | ]}|d  qS labelr   r   r   r   r   r   *       c                 S   r      r   r   ir   r   r   r   .   r   c                 S   s   g | ]\}}||fqS r   r   )r   kvr   r   r   r   .   r   c                 S   s   g | ]}d qS r   r   )r   _r   r   r   r   1   s    Nc                    s&   g | ]}|d  d  |d dqS )textNr   r'   r   r   r   )
max_lengthr   r   r   5   s   & r'   r   r    r   )	upper_lim	lower_limc                    s   g | ]}| d qS )r(   r   )r   seqr   r   r   r   ?   r   c                    s   g | ]
}  | d  qS )UNK)get)r   c)
char_indexr   r   r   J   s    c                    s    g | ]}  ||  qS r   r   r!   )batch_lengths
batch_sizelengthr   r   r   S   s     c                    s   g | ]}  |qS r   )build_batch_tensors)r   batchr
   r   r   r   W   r   c                    s"   g | ]} | t d |d  qS )r       )maxr   )most_frequentr   r   r   \   s   " r	   dtype)openreadsplitrandomshuffledictsetkeyslenr   sorteditemsr   r   randomize_datalistappendranger   r8   torchtensorr	   floatr   iterr   )r
   r2   
data_filesr0   	tag_index	randomizerandomize_ranger)   examples	data_file
new_labels	new_labellang_countssplit_examplesexamplesequence	sequencessequence_as_listr   sublistr   )r1   r2   r0   r   r3   r)   r9   r
   r   	load_data   sb   	"
$

zDataLoader.load_datar   r   c           
      C   s   g }| D ]=}|}|t |k rAt||}tt ||}|d| }|| ||d dd}	t |	dkr7n
|	d }|t |k sqt| |S )z
        Takes the original data and creates random length examples with length between upper limit and lower limit
        From LSTM_langid: https://github.com/AU-DIS/LSTM_langid/blob/main/src/language_datasets.py
        N r    )rD   r?   randintminrI   r>   r@   )
	sentencesr*   r+   new_datasentence	remaininglimmnew_sentencer>   r   r   r   rG   c   s    

	zDataLoader.randomize_datac                 C   sN   t  }tjdd |D | jtjd|d< tjdd |D | jtjd|d< |S )z5
        Helper to turn batches into tensors
        c                 S   r   r%   r   r   sr   r   r   r      r   z2DataLoader.build_batch_tensors.<locals>.<listcomp>r:   rb   c                 S   r   r   r   ri   r   r   r   r      r   targets)rA   rK   rL   r	   long)r
   r5   batch_tensorsr   r   r   r4   y   s   ""zDataLoader.build_batch_tensorsc                 C   s
   t | jS r   )nextr   r6   r   r   r   rn      s   
zDataLoader.nextr   )Fr   N)r   r   )
__name__
__module____qualname____doc__r   r^   staticmethodrG   r4   rn   r   r   r   r   r      s    


Jr   )r   r?   rK   r   r   r   r   r   <module>   s    