o
    –h¸&  ã                   @   sÄ   d Z ddlZddlmZ ddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ e d	¡Ze ej¡ d
ZdZdd„ Zedkr`eƒ  dS dS )a#  
Loads NER models & separates out the word vectors to base & delta

The model will then be resaved without the base word vector,
greatly reducing the size of the model

This may be useful for any external users of stanza who have an NER
model they wish to reuse without retraining

If you know which pretrain was used to build an NER model, you can
provide that pretrain.  Otherwise, you can give a directory of
pretrains and the script will test each one.  In the latter case,
the name of the pretrain needs to look like lang_dataset_pretrain.pt
é    N)Údefaultdict)ÚPipeline)Úlang_to_langcode)ÚPretrainÚPretrainedWordVocab)ÚPAD_IDÚVOCAB_PREFIX)ÚTrainerÚstanzaFg-Cëâ6?c            (         sd  t  ¡ } | jdtddd | jdtddd | jdtd	d
d |  ¡ }tj |j¡rA|j}t 	|¡}t
|ƒdkr@td |j¡ƒ‚ntj |j¡sPtd |j¡ƒ‚tj |j¡\}}|g}tj |j¡r|j}t 	|¡‰ttƒ}ˆD ]‰|ˆ d¡d   ˆ¡ qqntj |¡\}‰ˆg‰t‡fdd„ƒ}|j}tj|dd g }g }g }	|D ]Ä}
tj ||
¡}d}|
 |¡s¾td |
¡ƒ‚|
d t
|ƒ … }|jddd\}}tdƒ td||f ƒ t|ddd|i|d}|jd }td |¡ƒ |jd }|jj}|d ‰|jjjj d }t!|j"d ƒ}||kr*|dkr|d ks*td! ||j"d |¡ƒ‚t#t$|| ||  ƒƒ}|D ]l}tj ||¡}td" |¡ƒ t%|d#‰td$ ˆj&j ¡ƒ td% |jjjj ¡ƒ ˆj&j d |jjjj d krwtd&ƒ q8t'ˆj&j d |jjjj d ƒ}ˆj&j d |jjjj d krÄt(‡fd'd(„t)|ƒD ƒƒr¨td)ƒ ntd*ƒ q8ˆj&j d |jjjj d k rÄtd+ |¡ƒ t*|j +¡ ƒj,}|jjjd |…d d …f ˆj& -|¡d |…d d …f  ‰ ˆ  .¡ ‰ t/j0j1ˆ dd, 2¡  3¡ }t4 5|dk ¡dkrtd-ƒ‚t4 5|t6k ¡}||d. krAtd/|||f ƒ ˆj&j d |jjjj d kr?td0ƒ ˆj‰ˆ|d< ˆj&j d |j"d1<  nrtd2|||f ƒ t5‡fd3d(„ˆD ƒƒ}td4| ƒ t7r¤d}ˆD ];}|ˆjvrlqbˆ 8|¡} |jjj| d d …f }!ˆj 8|¡}"ˆj&|"d d …f }#|! .¡  2¡ |#  1¡ t6k rœ|d7 }qbtd5| ƒ q8td6 |¡ƒ | |
¡ q¦d7| 9¡ vs¼J ‚‡ fd8d9„t)d:ƒD ƒ}$g }%t)d:t
|ƒƒD ]}&||& d;krê|% ˆ :|&¡¡ |$ ˆ |&  2¡ ¡ qÐ|jj; d<¡ t
|%ƒdkrtd=ƒ |	 |
¡ nTtd>t
|%ƒ ƒ td?t
|$ƒ ƒ t4 <|$¡}$t/ =|$¡}$|$j d t
|%ƒt
t>ƒ ks.J ‚t|$j ƒ t?|%ˆj@ˆjAd@}%|%|d7< tB C|$j d |$j d tD¡|j_E|jjEjjF G|$¡ tj ||
¡}'| H|'¡ | |
|f¡ q¦tƒ  t
|ƒdkr„tdAƒ |D ]}&t|&ƒ q|t
|ƒdkr™tdBƒ |D ]}&t|&ƒ q‘t
|	ƒdkr®tdCƒ |	D ]	}&t|&ƒ q¦d S d S )DNz--input_pathzsaved_models/nerz*Where to find NER models (dir or filename))ÚtypeÚdefaultÚhelpz--output_pathzsaved_models/shrunkz&Where to write shrunk NER models (dir)z--pretrain_pathzsaved_models/pretrainz)Where to find pretrains (dir or filename)r   zNo ner models found in {}zNo ner model found at path {}Ú_c                      s   ˆ S ©N© r   )Ú	pretrainsr   úf/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/training/separate_ner_pretrain.pyÚ<lambda>B   s    zmain.<locals>.<lambda>T)Úexist_okz_nertagger.ptzUnexpected name: {}é   )Úmaxsplitz/===============================================zProcessing lang %s package %sztokenize,nerÚner)Ú
processorsÚtokenize_pretokenizedÚpackageÚner_model_pathzLoaded NER processor: {}ÚwordÚlangÚzhzzh-hansz#lang not as expected: {} vs {} ({})zAttempting pretrain: {})Úfilenamez"  pretrain shape:               {}z"  embedding in ner model shape: {}z%  DIMENSION DOES NOT MATCH.  SKIPPINGc                 3   s$    | ]}ˆ   |¡ˆ   |¡kV  qd S r   )Úid2unit©Ú.0Úx)Ú
word_vocabr   r   Ú	<genexpr>{   s   €" zmain.<locals>.<genexpr>z=  Attempting to use pt vectors to replace ner model's vectorsz:  NUM VECTORS DO NOT MATCH.  WORDS DO NOT MATCH.  SKIPPINGzR  WARNING: if any vectors beyond {} were fine tuned, that fine tuning will be lost)Údimz,This should not be - a norm was less than 0!é   z*  Accepted!  %d of %d vectors match for %sz+  Setting model vocab to match the pretrainÚword_emb_dimz,  %d of %d vectors matched for %s - SKIPPINGc                 3   s    | ]}|ˆ j v V  qd S r   )Úvocabr!   )Úptr   r   r%   “   s   € z  %d words were in both vocabsz1  %d vectors were close when ignoring id orderingz COULD NOT FIND A MATCHING PT: {}Údeltac                    s   g | ]}ˆ |   ¡ ‘qS r   )Úcpu)r"   Úi)r+   r   r   Ú
<listcomp>«   s    zmain.<locals>.<listcomp>é   g        Úword_embzJNo vectors were changed!  Perhaps this model was trained without finetune.z%d delta vocabz%d vectors in the delta set)r   ÚlowerzFinal pretrain mappings:zMISSING EMBEDDINGS:zNOT FINE TUNED:)IÚargparseÚArgumentParserÚadd_argumentÚstrÚ
parse_argsÚosÚpathÚisdirÚ
input_pathÚlistdirÚlenÚFileNotFoundErrorÚformatÚisfileÚsplitÚpretrain_pathr   ÚlistÚappendÚoutput_pathÚmakedirsÚjoinÚendswithÚ
ValueErrorÚprintr   r   ÚtrainersÚmodelr)   r0   ÚweightÚshaper   ÚargsÚsortedÚsetr   ÚembÚminÚallÚrangeÚnextÚ
parametersÚdeviceÚtoÚdetachÚtorchÚlinalgÚnormr,   ÚnumpyÚnpÚsumÚEPSÚDEBUGÚunit2idÚkeysr    Úunsaved_modulesÚstackÚ
from_numpyr   r   r   r1   ÚnnÚ	Embeddingr   Ú	delta_embÚdataÚcopy_Úsave)(ÚparserrN   Úner_model_dirÚnersÚpt_model_dirÚlang_to_pretrainÚnew_dirÚfinal_pretrainsÚmissing_pretrainsÚno_finetuneÚ	ner_modelÚner_pathÚexpected_endingÚ
short_namer   r   ÚpipeÚner_processorÚtrainerr)   Únum_vectorsÚlcodeÚner_pretrainsÚpt_modelÚpt_pathÚNrW   Údelta_normsÚnum_matchingÚ
vocab_sameÚrearranged_countr#   Úx_idÚx_vecÚpt_idÚpt_vecÚdelta_vectorsÚdelta_vocabr-   Únew_pathr   )r+   r   r*   r$   r   Úmain%   s  
ÿ
ÿ






6
€€
€

 

ýrŽ   Ú__main__)Ú__doc__r2   Úcollectionsr   Úloggingr7   r]   r^   rZ   Útorch.nnrg   r
   r   Ústanza.models.common.constantr   Ústanza.models.common.pretrainr   r   Ústanza.models.common.vocabr   r   Ústanza.models.ner.trainerr	   Ú	getLoggerÚloggerÚsetLevelÚERRORra   r`   rŽ   Ú__name__r   r   r   r   Ú<module>   s,    
 2
ÿ