o
    hV/                     @   s2  d Z ddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZmZ ddlmZmZ ddlmZ ddlmZ ddlZedZG d	d
 d
e
ZG dd dZdd ZedkreddZed W d   n1 sqw   Y  eddZeej  ej!dddZ"ee" eddZeej  dS dS )z
Supports for pretrained data.
    N   )	BaseVocabVOCAB_PREFIXUNK_ID)open_read_binaryopen_read_text)DEFAULT_MODEL_DIR)UnpicklingErrorstanzac                       s$   e Zd Zdd Z fddZ  ZS )PretrainedWordVocabc                 C   s&   t | j | _dd t| jD | _d S )Nc                 S   s   i | ]\}}||qS  r   ).0iwr   r   X/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/common/pretrain.py
<dictcomp>   s    z3PretrainedWordVocab.build_vocab.<locals>.<dictcomp>)r   data_id2unit	enumerate_unit2idselfr   r   r   build_vocab   s   zPretrainedWordVocab.build_vocabc                    s    t  |}|r|dd}|S )N     )supernormalize_unitreplace)r   unit	__class__r   r   r      s   z"PretrainedWordVocab.normalize_unit)__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                   @   sn   e Zd ZdZdddZedd Zed	d
 Zdd Zdd Z	dddZ
dd Zedd ZedddZdS )Pretrainz/ A loader and saver for pretrained embeddings. NTc                 C   s"   || _ || _|| _|| _|| _d S N)filename_vec_filename_csv_filename
_max_vocab_save_to_file)r   r(   vec_filename	max_vocabsave_to_filecsv_filenamer   r   r   __init__%   s
   
zPretrain.__init__c                 C      t | ds	|   | jS )N_vocab)hasattrloadr3   r   r   r   r   vocab,      
zPretrain.vocabc                 C   r2   )N_emb)r4   r5   r8   r   r   r   r   emb2   r7   zPretrain.embc              
   C   s  | j d urtj| j rzkztj| j dd dd}W n ty3   tj| j dd dd}td Y nw t	
d| j  t|tsJtd	| j d
|vsRd|vrZtd| j t|d | _|d
 | _t| jtjrut| j| _W d S  ttfy     ty } z| js| js t	d| j | |  \}}W Y d }~n&d }~ww | js| jstd| j | j d urt	d| j   |  \}}|| _|| _| jr| j d usJ d|  | j  d S d S )Nc                 S      | S r'   r   storagelocr   r   r   <lambda>=       zPretrain.load.<locals>.<lambda>Tweights_onlyc                 S   r:   r'   r   r;   r   r   r   r>   ?   r?   Fa+  The saved pretrain has an old format using numpy.ndarray instead of torch to store weights.  This version of Stanza can support reading both the new and the old formats.  Future versions will only allow loading with weights_only=True.  Please resave the pretrained embedding using this version ASAP.zLoaded pretrain from {}zFile {} exists but is not a stanza pretrain file.  It is not a dict, whereas a Stanza pretrain should have a dict with 'emb' and 'vocab'r9   r6   zFile {} exists but is not a stanza pretrain file.  A Stanza pretrain file should have 'emb' and 'vocab' fields in its state dictzXPretrained file exists but cannot be loaded from {}, due to the following exception:
	{}zCPretrained file {} does not exist, and no text/xz file was providedz]Pretrained filename %s specified, but file does not exist.  Attempting to load from text filez<Filename must be provided to save pretrained vector to file.)!r(   ospathexiststorchr5   r	   warningswarnloggerdebugformat
isinstancedictRuntimeErrorr   load_state_dictr3   r8   npndarray
from_numpyKeyboardInterrupt
SystemExitBaseExceptionr)   r*   warningread_pretrainFileNotFoundErrorinfor,   save)r   r   er6   r9   r   r   r   r5   8   sL   


zPretrain.loadc              
   C   s   t j|\}}|rt j|dd | j | jd}ztj||dd t	
d| W d S  ttfy7     tyQ } zt	d| W Y d }~d S d }~ww )NT)exist_ok)r6   r9   F)_use_new_zipfile_serializationz(Saved pretrained vocab and vectors to {}zVSaving pretrained data failed due to the following exception... continuing anyway.
	{})rB   rC   splitmakedirsr6   
state_dictr9   rE   rY   rH   rX   rJ   rR   rS   rT   rU   )r   r(   	directory_r   rZ   r   r   r   rY   a   s   zPretrain.saveFc                 C   s   t |dM}|r| jd jd }|dt| j|f  t| jD ](\}}| j| d}|| |d |ddd |D  |d q!W d	   d	S 1 sUw   Y  d	S )
z9
        Write the vocab & values to a text file
        r   r   z%d %d
cpur   c                 S   s   g | ]}d |   qS )z%.6f)itemr   xr   r   r   
<listcomp>|   s    z'Pretrain.write_text.<locals>.<listcomp>
N)	openr9   shapewritelenr6   r   tojoin)r   r(   headerfoutword_dimword_idxwordrowr   r   r   
write_textp   s   

"zPretrain.write_textc                 C   s   | j d ur| | j | j\}}}n| jd ur| | j\}}ntdt|tt t|kr3td| jttkrW| jt|tt k rW|d | jtt  }|d | j }t|}||fS )NzVector file is not provided.z8Loaded number of vectors does not match number of words.)	r)   read_from_filer+   r*   read_from_csvrM   rk   r   r   )r   wordsr9   failedr6   r   r   r   rV      s   

$zPretrain.read_pretrainc           
      C   s   t d|  t| }t|}|D ]} dd |D }W d   n1 s&w   Y  t|}t|d d }tj|tt |ftj	d}t
|D ]\}}tjdd || d D tj	d||tt < qId	d |D }	|	|fS )
zD
        Read vectors from CSV

        Skips the first row
        z/Reading pretrained vectors from csv file %s ...c                 S   s   g | ]}|qS r   r   r   liner   r   r   rf      s    z*Pretrain.read_from_csv.<locals>.<listcomp>Nr   r   dtypec                 S      g | ]}t |qS r   floatrd   r   r   r   rf          c                 S   s   g | ]
}|d   ddqS )r   r   r   )r   ry   r   r   r   rf      s    )rH   rX   r   csvreaderrk   rE   zerosr   float32r   tensor)
r(   fin
csv_readerrz   linesrowscolsr9   r   rw   r   r   r   rv      s   

2zPretrain.read_from_csvc              
      s  t d|  td}d}d g }d}d}t| k}t|D ]^\}}	z|	 }	W n ty5   |d7 }Y qw |	 }	|	s=q|	|	}
|rSd}t
|
dkrSt|
d  q|
d d	krk|durct d
 nt d |
}q|rw|dk swt
||k r|||
 qW d   n1 sw   Y   du rtdd |D d  t
|}tj|t
t  ftjd}|durtjdd |  d D tjd|t< t|D ]\}}	tjdd |	  d D tjd||t
t < qɇ fdd|D }|dkrt d| |||fS )zR
        Open a vector file using the provided function and read from it.
        z&Reading pretrained vectors from %s ...z[ \t]+TNr   r   F   z<unk>zFMore than one <unk> line in the pretrain!  Keeping the most recent onez,Found an unk line while reading the pretrainc                 s   s    | ]}t |V  qd S r'   )rk   rd   r   r   r   	<genexpr>   s    z*Pretrain.read_from_file.<locals>.<genexpr>r{   c                 S   r}   r   r~   rd   r   r   r   rf      r   z+Pretrain.read_from_file.<locals>.<listcomp>c                 S   r}   r   r~   rd   r   r   r   rf      r   c                    s    g | ]}d  |d   qS )r   N)rm   ry   r   r   r   rf      s     z&Failed to read %d lines from embedding)rH   rX   recompiler   r   decodeUnicodeDecodeErrorrstripr]   rk   interrorrI   appendminrE   r   r   r   r   r   )r(   r.   tab_space_patternfirstr   rx   unk_linefr   rz   piecesr   r9   rw   r   r   r   ru      s\   




(2
zPretrain.read_from_file)NNr&   TN)Fr'   )r!   r"   r#   __doc__r1   propertyr6   r9   r5   rY   rt   rV   staticmethodrv   ru   r   r   r   r   r%   "   s    


)

r%   c                 C   s   | r| S t j|d|}t j|rtd|  |S td|  t j|d|}t j|r=td|  |S td|  |ddkrst jt|dd|	dd	d	 }t j|rltd|  |S td|  |S )
a`  
    When training a model, look in a few different places for a .pt file

    If a specific argument was passsed in, prefer that location
    Otherwise, check in a few places:
      saved_models/{model}/{shorthand}.pretrain.pt
      saved_models/{model}/{shorthand}_pretrain.pt
      ~/stanza_resources/{language}/pretrain/{shorthand}_pretrain.pt
    z{}.pretrain.ptzFound existing .pt file in %sz$Cannot find pretrained vectors in %sz{}_pretrain.ptra   r   pretrainz{}.ptr   )
rB   rC   rm   rJ   rD   rH   rI   findr   r]   )wordvec_pretrain_filesave_dir	shorthandlangdefault_pretrain_filepretrain_filer   r   r   find_pretrain_file   s&   
$r   __main__ztest.txtr   z3 2
a 1 1
b -1 -1
c 0 0
ztest.ptTr@   )#r   r   rB   r   lzmaloggingnumpyrO   rE   r6   r   r   r   stanza.models.common.utilsr   r   stanza.resources.commonr   pickler	   rF   	getLoggerrH   r   r%   r   r!   rh   ro   rj   r   printr9   r5   re   r   r   r   r   <module>   s:    
 K)


