o
    h+                     @   s   d dl m Z  d dlmZmZ d dlmZ d dlZd dlZdZd Z	dZ
dZdZd	Zd
ZdZee
eegZeeZG dd dZG dd deZG dd deZG dd dZG dd deZdS )    )copy)CounterOrderedDict)IterableNz<PAD>z<UNK>   z<EMPTY>   z<ROOT>   c                   @   s   e Zd ZdZd"ddZdd	 Zd
d Zedd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zed d! ZdS )#	BaseVocabz| A base class for common vocabulary operations. Each subclass should at least 
    implement its own build_vocab() function.N r   Fc                 C   s<   || _ || _|| _|| _|| _|d ur|   g d| _d S )N)langidxcutofflower_unit2id_id2unit)datar   r   r   r   build_vocabstate_attrs)selfr   r   r   r   r    r   U/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/common/vocab.py__init__   s   zBaseVocab.__init__c                 C   s   t d)NzfThis BaseVocab does not have build_vocab implemented.  This method should create _id2unit and _unit2idNotImplementedErrorr   r   r   r   r      s   zBaseVocab.build_vocabc                 C   s.   t  }| jD ]}t| |rt| |||< q|S )zw Returns a dictionary containing all states that are necessary to recover
        this vocab. Useful for serialization.)r   r   hasattrgetattr)r   stateattrr   r   r   
state_dict"   s   

zBaseVocab.state_dictc                 C   s(   |  }|  D ]
\}}t||| q|S )z= Returns a new Vocab instance constructed from a state dict. )itemssetattr)clsr   newr   valuer   r   r   load_state_dict+   s   zBaseVocab.load_state_dictc                 C   s   |d u r|S | j r|  S |S N)r   r   unitr   r   r   normalize_unit3   s
   zBaseVocab.normalize_unitc                 C   s(   |  |}|| jv r| j| S | jt S r&   )r)   r   UNKr'   r   r   r   unit2id<   s   



zBaseVocab.unit2idc                 C   
   | j | S r&   r   )r   idr   r   r   id2unitC      
zBaseVocab.id2unitc                        fdd|D S )Nc                       g | ]}  |qS r   )r+   .0xr   r   r   
<listcomp>G       z!BaseVocab.map.<locals>.<listcomp>r   )r   unitsr   r   r   mapF      zBaseVocab.mapc                    r1   )Nc                    r2   r   )r/   r3   r   r   r   r6   J   r7   z#BaseVocab.unmap.<locals>.<listcomp>r   )r   idsr   r   r   unmapI   r:   zBaseVocab.unmapc                 C   s2   | j rd| j  nd}tt| | }d|| jf S )Nz(%s)r
   z<%s: %s>)r   strtyper   )r   lang_strnamer   r   r   __str__L   s   zBaseVocab.__str__c                 C   s
   t | jS r&   )lenr   r   r   r   r   __len__Q   r0   zBaseVocab.__len__c                 C   s:   t |tr
| |S t |tst |tr| |S td)Nz*Vocab key must be one of str, list, or int)
isinstancer=   r+   intlistr/   	TypeErrorr   keyr   r   r   __getitem__T   s
   


zBaseVocab.__getitem__c                 C   s   |  || jv S r&   )r)   r   rH   r   r   r   __contains__\   s   zBaseVocab.__contains__c                 C   s   t | S r&   rB   r   r   r   r   size_   s   zBaseVocab.size)Nr
   r   r   F)__name__
__module____qualname____doc__r   r   r   classmethodr%   r)   r+   r/   r9   r<   rA   rC   rJ   rK   propertyrM   r   r   r   r   r	      s$    

	
	r	   c                       s(   e Zd ZdZ fddZdd Z  ZS )
DeltaVocaba  
    A vocab that starts off with a BaseVocab, then possibly adds more tokens based on the text in the given data

    Currently meant only for characters, such as built by MWT or Lemma

    Expected data format is either a list of strings, or a list of list of strings
    c                    s(   || _ t j||j|j|j|jd d S )N)r   r   r   r   r   )
orig_vocabsuperr   r   r   r   r   )r   r   rU   	__class__r   r   r   k   s   "zDeltaVocab.__init__c                    s   t dd  jD rd j}nddd  jD } fdd|D }t|dkrNtt|} jj|  _t jj	 _	|D ]
}t j	 j	|< qAd S  jj _ jj	 _	d S )Nc                 s   s    | ]}t |tV  qd S r&   )rD   r=   )r4   wordr   r   r   	<genexpr>p   s    z)DeltaVocab.build_vocab.<locals>.<genexpr>r
   c                 S      g | ]	}|D ]}|qqS r   r   )r4   sentencerY   r   r   r   r6   s       z*DeltaVocab.build_vocab.<locals>.<listcomp>c                    s   g | ]
}| j jvr|qS r   )rU   r   )r4   cr   r   r   r6   u   s    r   )
allr   joinrB   sortedsetrU   r   dictr   )r   allcharsunkr^   r   r   r   r   o   s   
zDeltaVocab.build_vocab)rN   rO   rP   rQ   r   r   __classcell__r   r   rW   r   rT   c   s    rT   c                       sZ   e Zd ZdZd fdd	Zdd	 Zd
d Zdd Zdd Zdd Z	dd Z
dd Z  ZS )CompositeVocaba   Vocabulary class that handles parsing and printing composite values such as
    compositional XPOS and universal morphological features (UFeats).

    Two key options are `keyed` and `sep`. `sep` specifies the separator used between
    different parts of the composite values, which is `|` for UFeats, for example.
    If `keyed` is `True`, then the incoming value is treated similarly to UFeats, where
    each part is a key/value pair separated by an equal sign (`=`). There are no inherit
    order to the keys, and we sort them alphabetically for serialization and deserialization.
    Whenever a part is absent, its internal value is a special `<EMPTY>` symbol that will
    be treated accordingly when generating the output. If `keyed` is `False`, then the parts
    are treated as positioned values, and `<EMPTY>` is used to pad parts at the end when the
    incoming value is not long enough.Nr
   r   Fc                    s4   || _ || _t j|||d |  jddg7  _d S )Nr   sepkeyed)ri   rj   rV   r   r   )r   r   r   r   ri   rj   rW   r   r   r      s   zCompositeVocab.__init__c                 C   s   | j sdd |D }n|| j }| jrBt|dkr#|d dkr#t S dd |D }tdd |D r<td	|| j f t|}|S |dkrHg }|S )
Nc                 S   s   g | ]}|qS r   r   r3   r   r   r   r6      s    z-CompositeVocab.unit2parts.<locals>.<listcomp>r   r   _c                 S   s   g | ]}| d qS )=)splitr3   r   r   r   r6      r7   c                 s   s    | ]	}t |d kV  qdS )r   NrL   r3   r   r   r   rZ      s    z,CompositeVocab.unit2parts.<locals>.<genexpr>zReceived "%s" for a dictionary which is supposed to be keyed, eg the entries should all be of the form key=value and separated by %s)ri   rm   rj   rB   rc   any
ValueError)r   r(   partsr   r   r   
unit2parts   s   zCompositeVocab.unit2partsc                    sD    | jr fddjD S  fddttjD S )Nc                    s.   g | ]}| v rj |  | tntqS r   )r   getUNK_IDEMPTY_IDr4   krp   r   r   r   r6      s   . z*CompositeVocab.unit2id.<locals>.<listcomp>c                    s2   g | ]}|t  k rj|  | tntqS r   )rB   r   rr   rs   rt   )r4   irw   r   r   r6      s   2 )rq   rj   r   rangerB   r'   r   rw   r   r+      s   
zCompositeVocab.unit2idc                 C   s   t | jdkrt|ts|f}g }t|| j D ]%\}}|tkr"q| jr4|d	|| j| |  q|| j| |  q| j
d urR| j
|}|dkrPd}|S |S )Nr   z{}={}r
   rk   )rB   r   rD   r   zipkeysrt   rj   appendformatri   r`   )r   r.   r    vrv   resr   r   r   r/      s   

zCompositeVocab.id2unitc           	         s~   fdd j D } jrNt  _|D ]+} |}|D ]!}| jvr*tt j|< ||  j| vr= j| ||  qqt jdkrMtt jd< nSt  _ fdd|D }t	dd |D }|D ],}t
|D ]%\}}| jvr|tt j|< |t|k r| j| vr j| | qlqft jdkrtt jd< t fddt j D  _ fdd	 jD  _d S )
Nc                    s    g | ]}|D ]}| j  qqS r   rh   )r4   sentwr   r   r   r6      s     z.CompositeVocab.build_vocab.<locals>.<listcomp>r   rk   c                    r2   r   )rq   )r4   ur   r   r   r6      r7   c                 S   s   g | ]}t |qS r   rL   )r4   pr   r   r   r6      s    c                    s   g | ]	}| j | fqS r   r-   ru   r   r   r   r6      r]   c                    s&   i | ]}|d d t  j| D qS )c                 S      i | ]\}}||qS r   r   r4   rx   r   r   r   r   
<dictcomp>   r7   z9CompositeVocab.build_vocab.<locals>.<dictcomp>.<dictcomp>)	enumerater   ru   r   r   r   r      s   & z.CompositeVocab.build_vocab.<locals>.<dictcomp>)r   rj   rc   r   rq   r   VOCAB_PREFIXr|   rB   maxr   r   ra   r{   r   )	r   allunitsr   rp   rI   allpartsmaxlenrx   r   r   r   r   r      s<   

	
"zCompositeVocab.build_vocabc                    s    fdd j D S )Nc                    s   g | ]	}t  j| qS r   )rB   r   ru   r   r   r   r6      r]   z'CompositeVocab.lens.<locals>.<listcomp>)r   r   r   r   r   lens   s   zCompositeVocab.lensc                 C   r,   r&   r-   )r   r   r   r   r   r       r0   zCompositeVocab.itemsc                 C   s.   dd | j  D }dt| d|}|S )Nc                 S   s"   g | ]\}}d d | d qS )[,])r`   )r4   rk   r5   r   r   r   r6      s   " z*CompositeVocab.__str__.<locals>.<listcomp>z	<{}:
 {}>z
 )r   r    r}   r>   r`   )r   piecesrepr   r   r   rA      s   zCompositeVocab.__str__)Nr
   r   r
   F)rN   rO   rP   rQ   r   rq   r+   r/   r   r   r    rA   rf   r   r   rW   r   rg      s    'rg   c                   @   sV   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
edd ZdS )BaseMultiVocaba$   A convenient vocab container that can store multiple BaseVocab instances, and support 
    safe serialization of all instances via state dicts. Each subclass of this base class 
    should implement the load_state_dict() function to specify how a saved state dict 
    should be loaded back.Nc                 C   sN   t  | _|d u r
d S tdd | D sJ | D ]	\}}|| j|< qd S )Nc                 S   s   g | ]}t |tqS r   )rD   r	   )r4   r~   r   r   r   r6      r7   z+BaseMultiVocab.__init__.<locals>.<listcomp>)r   _vocabsr_   valuesr    )r   
vocab_dictrv   r~   r   r   r   r      s   zBaseMultiVocab.__init__c                 C   s   || j |< d S r&   r   )r   rI   itemr   r   r   __setitem__  s   zBaseMultiVocab.__setitem__c                 C   r,   r&   r   rH   r   r   r   rJ     r0   zBaseMultiVocab.__getitem__c                 C   s   d t| d| j S )Nz
<{}: [{}]>z, )r}   r>   r`   r   r{   r   r   r   r   rA     s   zBaseMultiVocab.__str__c                 C   s
   || j v S r&   r   rH   r   r   r   rK     r0   zBaseMultiVocab.__contains__c                 C   s
   | j  S r&   )r   r{   r   r   r   r   r{     r0   zBaseMultiVocab.keysc                 C   s*   t  }| j D ]
\}}| ||< q|S )zG Build a state dict by iteratively calling state_dict() of all vocabs. )r   r   r    r   )r   r   rv   r~   r   r   r   r     s   zBaseMultiVocab.state_dictc                 C   s   t )z5 Construct a MultiVocab by reading from a state dict.r   )r"   r   r   r   r   r%     s   zBaseMultiVocab.load_state_dictr&   )rN   rO   rP   rQ   r   r   rJ   rA   rK   r{   r   rR   r%   r   r   r   r   r      s    
	r   c                   @   s   e Zd Zdd ZdS )	CharVocabc                    s   t jd d ttfr,tfddjD  t  D ]} | jk r* |= qn
tdd jD  tttt   fdddd _	d	d
 t
j	D _d S )Nr   c                    s*   g | ]}|D ]}| j  D ]}|qqqS r   rh   )r4   r   r   r^   r   r   r   r6   "  s   * z)CharVocab.build_vocab.<locals>.<listcomp>c                 S   r[   r   r   )r4   r   r^   r   r   r   r6   '  r]   c                    s    |  | fS r&   r   )rv   )counterr   r   <lambda>(  s    z'CharVocab.build_vocab.<locals>.<lambda>T)rI   reversec                 S   r   r   r   r   r   r   r   r   )  r7   z)CharVocab.build_vocab.<locals>.<dictcomp>)rD   r   rF   tupler   r{   r   r   ra   r   r   r   )r   rv   r   )r   r   r   r      s   (zCharVocab.build_vocabN)rN   rO   rP   r   r   r   r   r   r     s    r   )r   collectionsr   r   collections.abcr   ospicklePADPAD_IDr*   rs   EMPTYrt   ROOTROOT_IDr   rB   VOCAB_PREFIX_SIZEr	   rT   rg   r   r   r   r   r   r   <module>   s(    Qt+