o
    h                     @   s   d Z ddlZddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZmZmZmZ ddlm  m  mZ edZG dd	 d	Zd
ee de
dee fddZde
dedee fddZdd Zdd ZdddZdd Zdd ZdS )z(Stanza models classifier data functions.    N)
namedtuple)List)WVType)PADPAD_IDUNKUNK_IDstanzac                   @   s.   e Zd Zd
ddZdd Zdd Zdd	 ZdS )SentimentDatumNc                 C   s   || _ || _|| _d S N	sentimenttextconstituency)selfr   r   r    r   Y/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/classifiers/data.py__init__   s   
zSentimentDatum.__init__c                 C   s>   | |u rdS t |tsdS | j|jko| j|jko| j|jkS )NTF)
isinstancer
   r   r   r   )r   otherr   r   r   __eq__   s
   
$zSentimentDatum.__eq__c                 C   s   t |  S r   )str_asdictr   r   r   r   __str__   s   zSentimentDatum.__str__c                 C   s.   | j d u r| j| jdS | j| jt| j dS )N)r   r   r   )r   r   r   r   r   r   r   r   r   !   s   
zSentimentDatum._asdictr   )__name__
__module____qualname__r   r   r   r   r   r   r   r   r
      s
    
r
   sentencewordvec_typereturnc                 C   s   dd | D } dd | D } dd | D } dd | D } | g kr#dg} dd | D } |t jkr1| S |t jkrSg }| D ]}|dkrK|d	krKtd
d|}|| q:|S |t jkrZ| S |t jkra| S td	|)zg
    Process a line of text (with tokenization provided as whitespace)
    into a list of strings.
    c                 S   $   g | ]}| d D ]}|r	|q	qS )-split.0xyr   r   r   
<listcomp>.      $ zupdate_text.<locals>.<listcomp>c                 S   r!   )/r#   r%   r   r   r   r)   /   r*   c                 S      g | ]}|  qS r   )stripr&   r'   r   r   r   r)   0       c                 S   s   g | ]}|r|qS r   r   r.   r   r   r   r)   1   r/   r"   c                 S   r,   r   )lower)r&   wordr   r   r   r)   6   r/   01z[0-9]#zUnknown wordvec_type {})
r   WORD2VECGOOGLEresubappendFASTTEXTOTHER
ValueErrorformat)r   r   new_sentencer1   r   r   r   update_text'   s*   



r?   min_lenc              	      s   g }t | dD ])}t|dd}t|}W d   n1 s!w   Y  dd |D }|| q	fdd|D } rG fdd|D }|S )	zQ
    returns a list where the values of the list are
      label, [token...]
    ,zutf-8)encodingNc                 S   s*   g | ]}t |d  |d |ddfqS )r   r   r   N)r   getr.   r   r   r   r)   Q   s   * z read_dataset.<locals>.<listcomp>c              	      s@   g | ]}t |d  t|d  |d rt|d d  ndqS )r         N)r
   r?   tree_reader
read_treesr.   )r   r   r   r)   V   s   @ c                    s   g | ]}t |j kr|qS r   lenr   r.   )r@   r   r   r)   X   s    )r   r$   openjsonloadextend)datasetr   r@   linesfilenamefin	new_linesr   )r@   r   r   read_datasetH   s   rS   c                 C   sT   t dd | D }tdd |D r"dd tttt|D }|S tt|}|S )z-
    Returns a sorted list of label name
    c                 S   s   g | ]}|j qS r   )r   r.   r   r   r   r)   _   s    z"dataset_labels.<locals>.<listcomp>c                 s   s    | ]	}t d |V  qdS )z^[0-9]+$N)r7   match)r&   labelr   r   r   	<genexpr>`   s    z!dataset_labels.<locals>.<genexpr>c                 S   s   g | ]}t |qS r   )r   r.   r   r   r   r)   d   r/   )setallsortedmapintlist)rN   labelsr   r   r   dataset_labels[   s   r^   c                 C   sZ   t  }| D ]}|jD ]}|| q
qttgt| }|t tks'|t tkr+td|S )Nz"Unexpected values for PAD and UNK!)	rW   r   addr   r   r\   r   r   r<   )rN   vocabliner1   r   r   r   dataset_vocabi   s   
rb   Fc                 C   s|   t  }tttdd | D }|D ]}g ||< qt| D ]\}}|r1|t|j ||f q|t|j | q|S )z
    returns a dict mapping length -> list of items of that length

    an OrderedDict is used so that the mapping is sorted from smallest to largest
    c                 s   s    | ]}t |jV  qd S r   rH   r.   r   r   r   rV   z   s    z&sort_dataset_by_len.<locals>.<genexpr>)	collectionsOrderedDictrY   r\   rW   	enumeraterI   r   r9   )rN   
keep_indexsorted_datasetlengthslitem_idxitemr   r   r   sort_dataset_by_lens   s   
rl   c           	      C   s   g }|   D ]}t| | }t| || qg }g }|D ]&}|dkr3t|j|kr3||g q|| t||krE|| g }qt|dkrQ|| t| |S )z
    Given a dataset sorted by len, sorts within each length to make
    chunks of roughly the same size.  Returns all items as a single list.
    r   )keysr\   randomshufflerM   rI   r   r9   )	rg   
batch_sizebatch_single_itemrN   ri   itemsbatches
next_batchrk   r   r   r   shuffle_dataset   s&   




ru   c                    s2   t |} fdd|D }|rtdt| dS )z
    Check that all of the labels in the dataset are in the known labels.

    Actually, unknown labels could be acceptable if we just treat the model as always wrong.
    However, this is a good sanity check to make sure the datasets match
    c                    s   g | ]}| vr|qS r   r   )r&   ir]   r   r   r)      s    z check_labels.<locals>.<listcomp>z<Dataset contains labels which the model does not know about:N)r^   RuntimeErrorr   )r]   rN   
new_labels	not_foundr   rw   r   check_labels   s
   r{   )F) __doc__rc   r   loggingrK   rn   r7   typingr   stanza.models.classifiers.utilsr   stanza.models.common.vocabr   r   r   r   &stanza.models.constituency.tree_readermodelsr   rF   	getLoggerloggerr
   r   r?   r[   rS   r^   rb   rl   ru   r{   r   r   r   r   <module>   s(    
!

