"""
Converts a directory of models organized by type into a directory organized by language.

Also produces the resources.json file.

For example, on the cluster, you can do this:

python3 -m stanza.resources.prepare_resources --input_dir /u/nlp/software/stanza/models/current-models-1.5.0 --output_dir /u/nlp/software/stanza/models/1.5.0 > resources.out 2>&1
nlprun -a stanza-1.2 -q john "python3 -m stanza.resources.prepare_resources --input_dir /u/nlp/software/stanza/models/current-models-1.5.0 --output_dir /u/nlp/software/stanza/models/1.5.0" -o resources.out
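
After the models are copied, the script writes a resources.json describing them.  As a
rough illustrative sketch (not an exhaustive description of the format), a language entry
looks something like:

    "en": {
        "tokenize": {"combined": {"md5": "...", "dependencies": [...]}, ...},
        "pos": {"combined_charlm": {"md5": "...", "dependencies": [...]}, ...},
        "default_processors": {"tokenize": "combined", "pos": "combined_charlm", ...},
        "default_md5": "...",
        ...
    }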
    N)defaultdict)Path)__resources_version__)
lcode2langtwo_to_three_lettersthree_to_two_letters)PACKAGESTRANSFORMERSTRANSFORMER_NICKNAMES)*)DATASET_MAPPING)get_tqdmc                  C   s   t  } | jdtdt dd | jdtdt dd | jdd	d
dd | jdtd dd |  }tj|j	|_	tj|j
|_
|jd urPd|j  |_|S )Nz--input_dirz//u/nlp/software/stanza/models/current-models-%szRInput dir for various models.  Defaults to the recommended home on the nlp cluster)typedefaulthelpz--output_dirz /u/nlp/software/stanza/models/%szOutput dir for various models.z--packages_only
store_trueFz<Only build the package maps instead of rebuilding everything)actionr   r   z--langzOnly process this language or a comma-separated list of languages.  If left blank, will prepare all languages.  To use this argument, a previous prepared resources with all of the languages is necessary.,)argparseArgumentParseradd_argumentstrr   
parse_argsospathabspath	input_dir
output_dirlangjoinstripsplit)parserargs r$   ]/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/resources/prepare_resources.pyr      s   
r   thmy	tokenizermwt_expander
lemmatizertaggerr"   pretrain	nertaggerforward_charlmbackward_charlm	sentimentconstituencycoreflangid)tokenizemwtlemmaposdepparser,   nerr.   r/   r0   r1   r2   r3   c                 C   s   i | ]\}}||qS r$   r$   ).0ijr$   r$   r%   
<dictcomp>F   s    r=   c                 C   s   t | jddd d S )NT)parentsexist_ok)r   mkdir)dirr$   r$   r%   
ensure_dirI   s   rB   c                 C   s   t t|j t| | d S N)rB   r   parentshutilcopy2)srcdstr$   r$   r%   	copy_fileM   s   rI   c                 C   s   t | d }t| S )Nrb)openreadhashlibmd5	hexdigest)r   datar$   r$   r%   get_md5R   s   rQ   c                 C   s   | dd  dd} tt dd dD ]}| |r,| dt|d   } t| } nqtd	|  | dd\}}|||fS )
zc
    Split model names by _

    Takes into account packages with _ and processor types with _
    N._c                 S   s
   t |  S rC   )len)xr$   r$   r%   <lambda>_   s   
 z"split_model_name.<locals>.<lambda>)key   z#Could not find a processor type in )replacesortedending_to_processorkeysendswithrU   AssertionErrorr!   )model	processorr   packager$   r$   r%   split_model_nameW   s   

rc   c                 C   s   |  dr| d d } |  dr| d d } | ddfS |  dr+| d d } | ddfS |  d	r;| d d
 } | ddfS | d}|dkrZ| |d d  }|t v rZ| d | ddfS | ddfS )N
_finetunedi_nopretrainiF	_nocharlmiT_charlmirT   r   rY   )r^   rfindknown_nicknames)rb   
underscorenicknamer$   r$   r%   split_packagei   s"   
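
# Illustrative only (hypothetical filenames) -- roughly how the two helpers above
# decompose the model files found in the input directory:
#
#   split_model_name("en_combined_nocharlm_nertagger.pt") -> ("en", "combined_nocharlm", "ner")
#   split_package("combined_nocharlm")   -> ("combined", True, False)    # pretrain, but no charlm
#   split_package("combined_charlm")     -> ("combined", True, True)
#   split_package("combined_nopretrain") -> ("combined", False, False)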
def get_pretrain_package(lang, package, model_pretrains, default_pretrains):
    package, uses_pretrain, _ = split_package(package)

    if not uses_pretrain or lang in no_pretrain_languages:
        return None
    if model_pretrains is not None and lang in model_pretrains and package in model_pretrains[lang]:
        return model_pretrains[lang][package]
    if lang in default_pretrains:
        return default_pretrains[lang]
    raise RuntimeError("pretrain not specified for lang %s package %s" % (lang, package))

def get_charlm_package(lang, package, model_charlms, default_charlms):
    package, _, uses_charlm = split_package(package)

    if not uses_charlm:
        return None
    if model_charlms is not None and lang in model_charlms and package in model_charlms[lang]:
        return model_charlms[lang][package]
    return default_charlms.get(lang, None)

def get_con_dependencies(lang, package):
    # constituency models use the default pretrain and charlm for the language
    pretrain_package = get_pretrain_package(lang, package, None, default_pretrains)
    dependencies = [{'model': 'pretrain', 'package': pretrain_package}]

    charlm_package = get_charlm_package(lang, package, None, default_charlms)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_pos_charlm_package(lang, package):
    return get_charlm_package(lang, package, pos_charlms, default_charlms)

def get_pos_dependencies(lang, package):
    dependencies = []

    pretrain_package = get_pretrain_package(lang, package, pos_pretrains, default_pretrains)
    if pretrain_package is not None:
        dependencies.append({'model': 'pretrain', 'package': pretrain_package})

    charlm_package = get_pos_charlm_package(lang, package)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_lemma_pretrain_package(lang, package):
    package, uses_pretrain, uses_charlm = split_package(package)
    if not uses_pretrain or not uses_charlm:
        return None
    # only datasets with a lemma classifier attach a pretrain to the lemmatizer
    if "%s_%s" % (lang, package) not in LEMMA_CLASSIFIER_DATASETS:
        return None
    return get_pretrain_package(lang, package, {}, default_pretrains)

def get_lemma_charlm_package(lang, package):
    return get_charlm_package(lang, package, lemma_charlms, default_charlms)

def get_lemma_dependencies(lang, package):
    dependencies = []

    pretrain_package = get_lemma_pretrain_package(lang, package)
    if pretrain_package is not None:
        dependencies.append({'model': 'pretrain', 'package': pretrain_package})

    charlm_package = get_lemma_charlm_package(lang, package)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_depparse_charlm_package(lang, package):
    return get_charlm_package(lang, package, depparse_charlms, default_charlms)

def get_depparse_dependencies(lang, package):
    dependencies = []

    pretrain_package = get_pretrain_package(lang, package, depparse_pretrains, default_pretrains)
    if pretrain_package is not None:
        dependencies.append({'model': 'pretrain', 'package': pretrain_package})

    charlm_package = get_depparse_charlm_package(lang, package)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_ner_charlm_package(lang, package):
    return get_charlm_package(lang, package, ner_charlms, default_charlms)

def get_ner_pretrain_package(lang, package):
    return get_pretrain_package(lang, package, ner_pretrains, default_pretrains)

def get_ner_dependencies(lang, package):
    dependencies = []

    pretrain_package = get_ner_pretrain_package(lang, package)
    if pretrain_package is not None:
        dependencies.append({'model': 'pretrain', 'package': pretrain_package})

    charlm_package = get_ner_charlm_package(lang, package)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_sentiment_dependencies(lang, package):
    """
    Return a list of dependencies for the sentiment model

    Generally this will be pretrain, forward & backward charlm
    So far, this invariant is true:
    sentiment models use the default pretrain for the language
    also, they all use the default charlm for a language
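
    For illustration, the returned list therefore has roughly the shape
    [{'model': 'pretrain', 'package': ...},
     {'model': 'forward_charlm', 'package': ...},
     {'model': 'backward_charlm', 'package': ...}]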
    """
    pretrain_package = get_pretrain_package(lang, package, None, default_pretrains)
    dependencies = [{'model': 'pretrain', 'package': pretrain_package}]

    charlm_package = get_charlm_package(lang, package, None, default_charlms)
    if charlm_package is not None:
        dependencies.append({'model': 'forward_charlm', 'package': charlm_package})
        dependencies.append({'model': 'backward_charlm', 'package': charlm_package})

    return dependencies

def get_dependencies(processor, lang, package):
    """
    Get the dependencies for a particular lang/package based on the package name

    The package can include descriptors such as _nopretrain, _nocharlm, _charlm
    which inform whether or not this particular model uses charlm or pretrain
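
    For example (illustrative): a "_nocharlm" package gets only a pretrain dependency,
    a "_charlm" package gets the pretrain plus forward/backward charlm dependencies,
    and a "_nopretrain" package gets no dependencies at all.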
    """
    if processor == 'depparse':
        return get_depparse_dependencies(lang, package)
    elif processor == 'lemma':
        return get_lemma_dependencies(lang, package)
    elif processor == 'pos':
        return get_pos_dependencies(lang, package)
    elif processor == 'ner':
        return get_ner_dependencies(lang, package)
    elif processor == 'sentiment':
        return get_sentiment_dependencies(lang, package)
    elif processor == 'constituency':
        return get_con_dependencies(lang, package)
    return {}

def process_dirs(args):
    dirs = sorted(os.listdir(args.input_dir))
    resources = {}
    if args.lang:
        # when only rebuilding some languages, start from the previously prepared
        # resources.json and clear out the entries for the requested languages
        resources = json.load(open(os.path.join(args.output_dir, 'resources.json')))
        for lang in args.lang.split(","):
            resources[lang] = {}

    for model_dir in dirs:
        print(f"Processing models in {model_dir}")
        models = sorted(os.listdir(os.path.join(args.input_dir, model_dir)))
        for model in tqdm(models):
            if not model.endswith('.pt'):
                continue
            lang, package, processor = split_model_name(model)
            if args.lang and lang not in args.lang.split(","):
                continue

            # copy the model into the language/processor layout used for distribution
            input_path = os.path.join(args.input_dir, model_dir, model)
            output_path = os.path.join(args.output_dir, lang, 'models', processor, package + '.pt')
            copy_file(input_path, output_path)
            md5 = get_md5(output_path)

            dependencies = get_dependencies(processor, lang, package)
            if lang not in resources:
                resources[lang] = {}
            if processor not in resources[lang]:
                resources[lang][processor] = {}
            if dependencies:
                resources[lang][processor][package] = {'md5': md5, 'dependencies': dependencies}
            else:
                resources[lang][processor][package] = {'md5': md5}

    print("Processed initial model directories.  Writing preliminary resources.json")
    json.dump(resources, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)
def get_default_pos_package(lang, ud_package):
    charlm_package = get_pos_charlm_package(lang, ud_package)
    if charlm_package is not None:
        return ud_package + "_charlm"
    if lang in no_pretrain_languages:
        return ud_package + "_nopretrain"
    return ud_package + "_nocharlm"

def get_default_depparse_package(lang, ud_package):
    charlm_package = get_depparse_charlm_package(lang, ud_package)
    if charlm_package is not None:
        return ud_package + "_charlm"
    if lang in no_pretrain_languages:
        return ud_package + "_nopretrain"
    return ud_package + "_nocharlm"

def process_default_zips(args):
    resources = json.load(open(os.path.join(args.output_dir, 'resources.json')))

    for lang in resources.keys():
        if lang == 'url':
            continue
        if 'alias' in resources[lang]:
            continue
        if all(k in ('backward_charlm', 'forward_charlm', 'pretrain', 'lang_name') for k in resources[lang].keys()):
            # languages which only have charlms or pretrains get no default.zip
            continue
        if lang not in default_treebanks:
            raise AssertionError(f"{lang} not in default treebanks!!!")
        if args.lang and lang not in args.lang.split(","):
            continue
        print(f"Preparing default models for language {lang}")

        # collect each default model and the models it depends on
        models_needed = defaultdict(set)
        packages = resources[lang][PACKAGES]['default']
        for processor, package in packages.items():
            if processor == 'lemma' and package == 'identity':
                # the identity lemmatizer has no model file to package
                continue
            if processor == 'optional':
                # optional processors are not shipped in the default zip
                continue
            models_needed[processor].add(package)
            dependencies = get_dependencies(processor, lang, package)
            for dependency in dependencies:
                models_needed[dependency['model']].add(dependency['package'])

        model_files = []
        # walk the processors in a fixed order so the zip layout is deterministic
        for processor in PROCESSORS:
            if processor in models_needed:
                for package in sorted(models_needed[processor]):
                    filename = os.path.join(args.output_dir, lang, 'models', processor, package + '.pt')
                    if os.path.exists(filename):
                        print("   Model {} package {}: file {}".format(processor, package, filename))
                        model_files.append((filename, processor, package))
                    else:
                        raise FileNotFoundError(f"Processor {processor} package {package} needed for {lang} but cannot be found at {filename}")

        with zipfile.ZipFile(os.path.join(args.output_dir, lang, 'models', 'default.zip'), 'w', zipfile.ZIP_DEFLATED) as zipf:
            for filename, processor, package in model_files:
                zipf.write(filename=filename, arcname=os.path.join(processor, package + '.pt'))

        default_md5 = get_md5(os.path.join(args.output_dir, lang, 'models', 'default.zip'))
        resources[lang]['default_md5'] = default_md5

    print("Processed default model zips.  Writing resources.json")
    json.dump(resources, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)

def get_default_processors(resources, lang):
    """
    Build a default package for this language

    Will add each of pos, lemma, depparse, etc if those are available
    Uses the existing models scraped from the language directories into resources.json, as relevant
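
    For illustration, for a typical language this produces something like
    {'tokenize': 'combined', 'mwt': 'combined', 'lemma': 'combined_nocharlm',
     'pos': 'combined_charlm', 'depparse': 'combined_charlm', 'ner': 'ontonotes', ...}
    (hypothetical package names -- the real values come from the default_* tables)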
    """
    if lang == 'multilingual':
        return {'langid': 'ud'}

    ud_package = default_treebanks[lang]
    default_processors = {}

    # tokenizer: use the override if one is listed, otherwise the UD package
    if lang in default_tokenizer:
        default_processors['tokenize'] = default_tokenizer[lang]
    else:
        default_processors['tokenize'] = ud_package

    # mwt only applies when an mwt model was built for the same package
    if 'mwt' in resources[lang] and default_processors['tokenize'] in resources[lang]['mwt']:
        default_processors['mwt'] = default_processors['tokenize']

    # lemma: use the UD lemmatizer if one was built, otherwise fall back to identity
    if 'lemma' in resources[lang]:
        expected_lemma = ud_package + "_nocharlm"
        if expected_lemma in resources[lang]['lemma']:
            default_processors['lemma'] = expected_lemma
    if 'lemma' not in default_processors and lang not in allowed_empty_languages:
        default_processors['lemma'] = 'identity'

    if 'pos' in resources[lang]:
        default_processors['pos'] = get_default_pos_package(lang, ud_package)
        if default_processors['pos'] not in resources[lang]['pos']:
            raise AssertionError("Expected POS model not in resources: %s" % default_processors['pos'])
    elif lang not in allowed_empty_languages:
        raise AssertionError("Expected to find POS models for language %s" % lang)

    if 'depparse' in resources[lang]:
        default_processors['depparse'] = get_default_depparse_package(lang, ud_package)
        if default_processors['depparse'] not in resources[lang]['depparse']:
            raise AssertionError("Expected depparse model not in resources: %s" % default_processors['depparse'])
    elif lang not in allowed_empty_languages:
        raise AssertionError("Expected to find depparse models for language %s" % lang)

    if lang in default_ners:
        default_processors['ner'] = default_ners[lang]
    if lang in default_sentiment:
        default_processors['sentiment'] = default_sentiment[lang]
    if lang in default_constituency:
        default_processors['constituency'] = default_constituency[lang]

    # some processors (constituency, coref) are shipped as optional extras
    optional_processors = get_default_optional_processors(resources, lang)
    if optional_processors:
        default_processors['optional'] = optional_processors
    return default_processors

def get_default_optional_processors(resources, lang):
    optional_processors = {}
    if lang in optional_constituency:
        optional_processors['constituency'] = optional_constituency[lang]
    if lang in optional_coref:
        optional_processors['coref'] = optional_coref[lang]
    return optional_processors

def update_processor_add_transformer(resources, lang, current_processors, processor, transformer):
    if processor not in current_processors:
        return
    new_model = current_processors[processor].replace("_nocharlm", "_" + transformer).replace("_charlm", "_" + transformer)
    if new_model in resources[lang][processor]:
        current_processors[processor] = new_model
    else:
        print("WARNING: wanted to use %s for %s accurate %s, but that model does not exist" % (new_model, lang, processor))

def get_default_accurate(resources, lang):
    """
    A package that, if available, uses charlm and transformer models for each processor
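
    As a rough illustration: where the default package uses "combined_nocharlm" models,
    this package prefers "combined_charlm", and for languages with a known transformer
    it prefers a "combined_<nickname>" model (for example "combined_electra") whenever
    such a model exists in resources.json.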
    """
    default_accurate = get_default_processors(resources, lang)

    # upgrade the lemmatizer from the nocharlm to the charlm version when one exists
    if 'lemma' in default_accurate and default_accurate['lemma'] != 'identity':
        lemma_model = default_accurate['lemma'].replace("_nocharlm", "_charlm")
        if lemma_model != default_accurate['lemma']:
            if lemma_model in resources[lang]['lemma']:
                default_accurate['lemma'] = lemma_model
            else:
                print("WARNING: wanted to use %s for %s default_accurate lemma, but that model does not exist" % (lemma_model, lang))

    # if a transformer is known for this language, swap in the transformer models
    transformer = TRANSFORMER_NICKNAMES.get(TRANSFORMERS.get(lang, None), None)
    if transformer is not None:
        for processor in ('pos', 'depparse', 'ner', 'sentiment', 'constituency'):
            update_processor_add_transformer(resources, lang, default_accurate, processor, transformer)
        if 'lemma' in default_accurate and (default_accurate['lemma'].endswith("_nocharlm") or default_accurate['lemma'].endswith("_charlm")):
            update_processor_add_transformer(resources, lang, default_accurate, 'lemma', transformer)

    optional_processors = get_optional_accurate(resources, lang)
    if optional_processors:
        default_accurate['optional'] = optional_processors
    return default_accurate

def get_optional_accurate(resources, lang):
    optional_processors = get_default_optional_processors(resources, lang)

    transformer = TRANSFORMER_NICKNAMES.get(TRANSFORMERS.get(lang, None), None)
    if transformer is not None:
        for processor in ('constituency', 'coref'):
            update_processor_add_transformer(resources, lang, optional_processors, processor, transformer)

    if lang in optional_coref:
        optional_processors['coref'] = optional_coref[lang]
    return optional_processors

def get_default_fast(resources, lang):
    """
    Build a packages entry which only has the nocharlm models

    Will make it easy for people to use the lower tier of models

    We do this by building the same default package as normal,
    then switching everything out for the lower tier model when possible.
    We also remove constituency, as it is super slow.
    Note that in the case of a language which doesn't have a charlm,
    that means we wind up building the same for default and default_nocharlm
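
    For example (illustrative): a default entry of "combined_charlm" for pos becomes
    "combined_nocharlm" here, provided that model actually exists in resources.json.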
    """
    default_fast = get_default_processors(resources, lang)
    # constituency is removed entirely - it is super slow
    if 'constituency' in default_fast:
        default_fast.pop('constituency')

    for processor, package in default_fast.items():
        if '_charlm' in package:
            nocharlm = package.replace('_charlm', '_nocharlm')
            if nocharlm not in resources[lang][processor]:
                print("WARNING: wanted to use %s for %s default_fast processor %s, but that model does not exist" % (nocharlm, lang, processor))
            else:
                default_fast[processor] = nocharlm
    return default_fast

def process_packages(args):
    """
    Build a package for a language's default processors and all of the treebanks specifically used for that language
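
    Illustrative sketch of the result for one language: the packages table maps
    "default", "default_fast" and "default_accurate" to processor->package dicts,
    plus one entry per tokenizer dataset (for example "gsd": {"tokenize": "gsd", ...}).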
    """
    resources = json.load(open(os.path.join(args.output_dir, 'resources.json')))

    for lang in resources.keys():
        if lang == 'url':
            continue
        if 'alias' in resources[lang]:
            continue
        if all(k in ('backward_charlm', 'forward_charlm', 'pretrain', 'lang_name') for k in resources[lang].keys()):
            continue
        if lang not in default_treebanks:
            raise AssertionError(f"{lang} not in default treebanks!!!")
        if args.lang and lang not in args.lang.split(","):
            continue

        default_processors = get_default_processors(resources, lang)
        resources[lang]['default_processors'] = default_processors

        resources[lang][PACKAGES] = {}
        resources[lang][PACKAGES]['default'] = default_processors
        if lang not in allowed_empty_languages and lang != 'multilingual':
            default_fast = get_default_fast(resources, lang)
            resources[lang][PACKAGES]['default_fast'] = default_fast
            default_accurate = get_default_accurate(resources, lang)
            resources[lang][PACKAGES]['default_accurate'] = default_accurate

        # also build a package entry for each tokenizer dataset of the language,
        # so that users can ask for a specific treebank by name
        if 'tokenize' in resources[lang]:
            for package in resources[lang]['tokenize']:
                processors = {'tokenize': package}
                if 'mwt' in resources[lang] and package in resources[lang]['mwt']:
                    processors['mwt'] = package

                if 'lemma' in resources[lang]:
                    if package + '_nocharlm' in resources[lang]['lemma']:
                        processors['lemma'] = package + '_nocharlm'
                    elif package + '_charlm' in resources[lang]['lemma']:
                        processors['lemma'] = package + '_charlm'

                if 'pos' in resources[lang] and 'lemma' in processors:
                    lemma_package = processors['lemma']
                    if lemma_package in resources[lang]['pos']:
                        processors['pos'] = lemma_package

                if 'depparse' in resources[lang] and 'pos' in processors:
                    depparse_package = None
                    if package + '_nocharlm' in resources[lang]['depparse']:
                        depparse_package = package + '_nocharlm'
                    elif package + '_charlm' in resources[lang]['depparse']:
                        depparse_package = package + '_charlm'
                    if depparse_package is not None:
                        processors['depparse'] = depparse_package

                resources[lang][PACKAGES][package] = processors

    print("Processed packages.  Writing resources.json")
    json.dump(resources, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)

def process_lcode(args):
    resources = json.load(open(os.path.join(args.output_dir, 'resources.json')))
    resources_new = {}
    resources_new['multilingual'] = resources['multilingual']
    for lang in resources:
        if lang == 'multilingual':
            continue
        if 'alias' in resources[lang]:
            continue
        if lang not in lcode2lang:
            print(lang + ' not found in lcode2lang!')
            continue
        lang_name = lcode2lang[lang]
        resources[lang]['lang_name'] = lang_name
        resources_new[lang] = resources[lang]
        resources_new[lang_name.lower()] = {'alias': lang}
        if lang in two_to_three_letters:
            resources_new[two_to_three_letters[lang]] = {'alias': lang}
        elif lang in three_to_two_letters:
            resources_new[three_to_two_letters[lang]] = {'alias': lang}

    print("Processed lcode aliases.  Writing resources.json")
    json.dump(resources_new, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)

def process_misc(args):
    resources = json.load(open(os.path.join(args.output_dir, 'resources.json')))
    resources['nb'] = {'alias': 'no'}
    resources['zh-hans'] = {'alias': 'zh'}
    resources['url'] = 'https://huggingface.co/stanfordnlp/stanza-{lang}/resolve/v{resources_version}/models/{filename}'
    print("Finalized misc attributes.  Writing resources.json")
    json.dump(resources, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)

def main():
    args = parse_args()
    print("Converting models from %s to %s" % (args.input_dir, args.output_dir))
    if not args.packages_only:
        process_dirs(args)
    process_packages(args)
    if not args.packages_only:
        process_default_zips(args)
        process_lcode(args)
        process_misc(args)

if __name__ == '__main__':
    main()