o
    h                     @   s   d dl mZ d dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlZd dlZd dlmZ edZG dd dZdejd	ed
ee fddZdd Zdd Zdd Zedkr_e  dS dS )    )CounterN)ListTupleAnyMapping)DEFAULT_BATCH_SIZEzstanza.lemmaclassifierc                   @   sD   e Zd Zedddfdededededef
d	d
Zdd Z	dd Z
dS )DatasetFNT	data_path
batch_size
get_countslabel_decodershufflec                 C   s*  |du s
t j|std| d|du ri }nt|}td| t }t }t|ddd}g g g g t	 i f\}	}
}}}}t
|}|d }|d	 | _t|D ]\}}|d
|d|d|df\}}}}d||||fv r~td| d| ||d}|du rt|||< g }|D ]}||vrt|||< |||  q|	| |
| || |||  |r|||   d7  < |||  || qRW d   n1 sw   Y  |	| _|
| _|| _|| _|| _|| _|| _|| _|| _dd t|D | _tdd |D | _dS )a  
        Loads a data file into data batches for tokenized text sentences, token indices, and true labels for each sentence.

        Args:
            data_path (str): Path to data file, containing tokenized text sentences, token index and true label for token lemma on each line.
            batch_size (int): Size of each batch of examples
            get_counts (optional, bool): Whether there should be a map of the label index to counts

        Returns:
            1. List[List[List[str]]]: Batches of sentences, where each token is a separate entry in each sentence
            2. List[torch.tensor[int]]: A batch of indexes for the target token corresponding to its sentence
            3. List[torch.tensor[int]]: A batch of labels for the target token's lemma
            4. List[List[int]]: A batch of UPOS IDs for the target token (this is a List of Lists, not a tensor. It should be padded later.)
            5 (Optional): A mapping of label ID to counts in the dataset.
            6. Mapping[str, int]: A map between the labels and their indexes
            7. Mapping[str, int]: A map between the UPOS tags and their corresponding IDs found in the UPOS batches
        Nz
Data file z could not be found.z2Final label decoder: %s  Should be strings to intszr+zutf-8)encoding	sentencesuposwordsindex	upos_tagslemmaz@Expected data to be complete but found a null value in sentence z:    c                 S   s   g | ]}|  qS  lower.0xr   r   _/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/lemma_classifier/utils.py
<listcomp>d   s    z$Dataset.__init__.<locals>.<listcomp>c                 s   s    | ]}|  V  qd S )Nr   r   r   r   r   	<genexpr>e   s    z#Dataset.__init__.<locals>.<genexpr>) ospathexistsFileNotFoundErrordictloggerdebugsetopenr   jsonloadtarget_upos	enumerateget
ValueErrorlenappendaddupdater   indicesupos_idslabelscountsr   
upos_to_idr
   r   sortedknown_wordstarget_words)selfr	   r
   r   r   r   r9   r8   finr   r2   r4   r3   r5   r6   
input_jsonsentences_dataidxsentencer   
target_idxr   labellabel_idconverted_upos_tagsupos_tagr   r   r   __init__   s\   

,


"zDataset.__init__c                 C   s$   t | j| j t | j| j dk S )z@
        Number of batches, rounded up to nearest batch
        r   )r.   r   r
   r:   r   r   r   __len__g   s   $zDataset.__len__c           
      #   s    t  j}tt|} jrt| t  D ]P} j| }t| j |} fdd||| D }t	
 fdd||| D } fdd||| D }t	
 fdd||| D }	||||	fV  qd S )Nc                       g | ]} j | qS r   )r   r   rF   r   r   r   v       z$Dataset.__iter__.<locals>.<listcomp>c                    rH   r   )r2   r   rF   r   r   r   w   rI   c                    rH   r   )r3   r   rF   r   r   r   x   rI   c                    rH   r   )r4   r   rF   r   r   r   y   rI   )r.   r   listranger   randomrG   r
   mintorchtensor)
r:   num_sentencesr2   ibatch_start	batch_endbatch_sentencesbatch_indicesbatch_upos_idsbatch_labelsr   rF   r   __iter__m   s   


  zDataset.__iter__)__name__
__module____qualname__r   strintboolr#   rE   rG   rX   r   r   r   r   r      s    (Wr   tokenized_indicesunknown_token_idxreturnc                    s    fddt | D S )a  
    Extracts the indices within `tokenized_indices` which match `unknown_token_idx`

    Args:
        tokenized_indices (torch.tensor): A tensor filled with tokenized indices of words that have been mapped to vector indices.
        unknown_token_idx (int): The special index for which unknown tokens are marked in the word vectors.

    Returns:
        List[int]: A list of indices in `tokenized_indices` which match `unknown_token_index`
    c                    s   g | ]
\}}| kr|qS r   r   )r   r>   token_indexr`   r   r   r      s    z1extract_unknown_token_indices.<locals>.<listcomp>)r+   )r_   r`   r   rc   r   extract_unknown_token_indices|   s   rd   c                  C   s:   t jjr	t d} t jj rt d} | S t d} | S )z/
    Get the device to run computations on
    cudampscpu)rN   re   is_availabledevicebackendsrf   )ri   r   r   r   
get_device   s   


rk   c                 C   s4   |dkrdS | | }|dkr| ||  }|S | }|S )Nr   z3Error: The second number (multiple) cannot be zero.r   )numbermultiple	remainderrounded_numberr   r   r   round_up_to_multiple   s   rp   c                  C   s8   t jt jtddd} t| dd\}}}}}}}d S )N	test_setsprocessed_ud_enzcombined_dev.txtT)r   )r   r    joindirname__file__load_dataset)default_test_pathsentence_batchesindices_batchesupos_batches_r5   r6   r   r   r   main   s   r|   __main__)collectionsr   r(   loggingr   rL   typingr   r   r   r   stanzarN   (stanza.models.lemma_classifier.constantsr   	getLoggerr$   r   rO   r]   rd   rk   rp   r|   rY   r   r   r   r   <module>   s$    
m
