o
    he                     @   sl  d Z ddlmZmZ ddlZddlZddlZddlZddlZddl	m
Z
 ddlZddlZddlZddlZddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ dd
l m!Z! e"dZ#e$e
% Z&dZ'dZ(e)de(d Z*e)de!Z+e)ddZ,e)dej-.e&dZ/dZ0G dd de1Z2G dd de3Z4G dd de3Z5edg dZ6dd Z7d d! Z8d"d# Z9d$d% Z:d&d' Z;dMd(d)Z<dNd+d,Z=dOd.d/Z>d0d1 Z?d2d3 Z@dPd4d5ZAd6d7 ZBd8d9 ZCd:d; ZDd<d= ZEe/e*de+ddfd>d?ZFe/dfd@dAZGdBdC ZHe/e*de+dfdDdEZIdFdG ZJe/e+e,dd-fdHdIZKdJe/di dde*de+e,dd-fdKdLZLdS )Qz(
Common utilities for Stanza resources.
    )defaultdict
namedtupleN)Path)tqdm)
make_table)TOKENIZEMWTPOSLEMMADEPPARSENER	SENTIMENT)PIPELINE_NAMESPROCESSOR_VARIANTS)PACKAGES)__resources_version__stanzaz:https://nlp.stanford.edu/software/stanza/stanza-resources/z?https://raw.githubusercontent.com/stanfordnlp/stanza-resources/STANZA_RESOURCES_URLmainSTANZA_RESOURCES_VERSIONSTANZA_MODEL_URLdefaultSTANZA_RESOURCES_DIRstanza_resources)pretrainforward_charlmbackward_charlmc                          e Zd Z fddZ  ZS )ResourcesFileNotFoundErrorc                    s   t  d| d || _d S )NzResources file not found at: z"  Try to download the model again.)super__init__resources_filepath)selfr!   	__class__ R/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/resources/common.pyr    -   s   
z#ResourcesFileNotFoundError.__init____name__
__module____qualname__r    __classcell__r%   r%   r#   r&   r   ,       r   c                       r   )UnknownLanguageErrorc                       t  d|  || _d S )NzUnknown language requested: )r   r    unknown_languager"   unknownr#   r%   r&   r    2      
zUnknownLanguageError.__init__r'   r%   r%   r#   r&   r-   1   r,   r-   c                       r   )UnknownProcessorErrorc                    r.   )Nz"Unknown processor type requested: )r   r    unknown_processorr0   r#   r%   r&   r    7   r2   zUnknownProcessorError.__init__r'   r%   r%   r#   r&   r3   6   r,   r3   ModelSpecification	processorpackagedependenciesc                 C   s   t | jddd dS )z/
    Create dir in case it does not exist.
    T)parentsexist_okN)r   mkdir)pathr%   r%   r&   
ensure_dir=   s   r>   c              
   C   sl   zt | d}| }W d   n1 sw   Y  W n ty. } z|js)| |_ d}~ww t| S )z&
    Get the MD5 value of a path.
    rbN)openreadOSErrorfilenamehashlibmd5	hexdigest)r=   findataer%   r%   r&   get_md5C   s   
rJ   c                 C   s\   t d|  d| d ttj| |}||  W d   dS 1 s'w   Y  dS )zD
    Fully unzip a file `filename` that's in a directory `dir`.
    zUnzip: /z...N)loggerdebugzipfileZipFileosr=   join
extractall)r=   rC   fr%   r%   r&   unzipP   s   "rT   c                 C   s>   t | d}t|jdksJ d|  dtj|jd jS )z:
    Get the root directory from a archived zip file.
    rr   zZip file at fz( seems to be corrupted. Please check it.)rN   rO   lenfilelistrP   r=   dirnamerC   )rC   zfr%   r%   r&   get_root_from_zipfileX   s
   
rZ   c                 C   s   t j| ot| |kS )zN
    Check if the file at `path` exists and match the provided md5 value.
    )rP   r=   existsrJ   )r=   rE   r%   r%   r&   file_existsa      r\   c                 C   sd   t j| sttjd| |r.t| }||kr0||kr%td| || d S t	d| ||f d S d S )NzCannot find expected filez?Found a possibly older version of file %s, md5 %s instead of %szmd5 for %s is %s, expected %s)
rP   r=   r[   FileNotFoundErrorerrnoENOENTrJ   rL   rM   
ValueError)r=   rE   alternate_md5file_md5r%   r%   r&   assert_file_existsg   s   rd   Fc              	   C   s   t jdv }tj| d|d}|r|  t|dV}t|jd}d}d|  }	t|dd| |	d	#}
|j	|d
D ]}|rL|
| |  |
t| q8W d   n1 sWw   Y  W d   |jS W d   |jS 1 sqw   Y  |jS )z<
    Download a URL into a file as specified by `path`.
    )r   
      T)streamproxieswbzcontent-lengthi   zDownloading B)totalunit
unit_scaledisabledesc)
chunk_sizeN)rL   levelrequestsgetraise_for_statusr@   intheadersr   iter_contentwriteflushupdaterV   status_code)urlr=   rh   rt   verboserU   rS   	file_sizedefault_chunk_sizero   pbarchunkr%   r%   r&   download_filer   s6   



r   Tc           
      C   s   t |j}t| t||r$|rtd|  dS td|  dS tj|d"}t	j
|t	j
|d }	t| |	|| t	|	| W d   n1 sOw   Y  t||| |rftd|  dS td|  dS )a  
    A complete wrapper over download_file() that also make sure the directory of
    `path` exists, and that a file matching the md5 value does not exist.

    alternate_md5 allows for an alternate md5 that is acceptable (such as if an older version of a file is okay)
    zFile exists: N)dirzDownloaded file to )r   parentr>   r\   rL   inforM   tempfileTemporaryDirectoryrP   r=   rQ   splitr   replacerd   )
r|   r=   rh   rE   rt   log_inforb   basedirtemptemppathr%   r%   r&   request_file   s"   

r   c                 C   sd   g }t D ]}| D ]}|d |kr|| qq| D ]}|D ]}|d |d kr) nq|| q|S )Nr   )r   append)processor_listsorted_listr7   itemr%   r%   r&   sort_processors   s   

r   c                 C   s   | t  }||| t v r#t|| t | v r#td|| || t< dS ||| t  v rFt|| v rH||| t v rJtd|| || t< dS dS dS dS )a  Add mwt if tokenize is passed without mwt.

    If tokenize is in the list, but mwt is not, and there is a corresponding
    tokenize and mwt pair in the resources file, mwt is added so no missing
    mwt errors are raised.

    TODO: how does this handle EWT in English?
    z8Language %s package %s expects mwt, which has been addedN)r   r   r   rL   warning)
processors	resourceslangvaluer%   r%   r&   add_mwt   s   	$,r   c                 C   s  t t}|rZtd |rt|v rt|vrt|| | | D ]8\}}t|t	s.t
dt|ttt	fs:t
dt|t	rB|g}|tvrP|rL|tvrPt||D ]}	|| | v rw|	| | | v rwtd| d|	 d || |	 qR|	| | t v r|| | t |	 v rtd| d| | t |	 |  d || | | t |	 |  qR|	| | t v rd| | t |	 v r|| | t |	 d v rtd| d| | t |	 d |  d || | | t |	 d |  qR|	t| v rtd| d|	 d|	 d	| d
	 || |	 qR|tkr2|	dkr2td| d|	 d || |	 qR|| | vrLt| d|	 d || |	 qRtd| d|	 d qRq |rtd t| | v r|| | t v r| | t |  D ]"\}}	|dkr||vrtd| d|	 d || |	 q|nUd}
tD ]D}|| | vrq|| | | v rd}
||vrtd| d| d || | qt| d| d| d||  d q|
std| d dd | D }t|}|S )a  
    Given a parsed resources file, language, and possible package
    and/or processors, expands the package to the list of processors

    Returns a list of processors
    Each item in the list of processors is a pair:
      name, then a list of ModelSpecification
    so, for example:
      [['pos', [ModelSpecification(processor='pos', package='gsd', dependencies=None)]],
       ['depparse', [ModelSpecification(processor='depparse', package='gsd', dependencies=None)]]]
    z$Processing parameter "processors"...zProcessor names must be stringsz Processor values must be stringszFound : .optionalz. Using external z variant for the z processor.identityz. Using identity lemmatizer.z: is not officially supported by Stanza, loading it anyway.zCan not find z' from official model list. Ignoring it.z!Processing parameter "package"...FTz is overwritten by zCan not find package: c                    s&   g | ]\ }  fd d|D gqS )c                    s   g | ]	}t  |d dqS )Nr6   )r5   ).0r   keyr%   r&   
<listcomp>&  s    z6maintain_processor_list.<locals>.<listcomp>.<listcomp>r%   )r   plistr%   r   r&   r   &  s   & z+maintain_processor_list.<locals>.<listcomp>)r   listrL   rM   r   r   r   items
isinstancestrra   tupler   PRETRAIN_NAMESr3   r   r   r   r
   r   r   )r   r   r8   r   allow_pretrainmaybe_add_mwtr   r   r   r   flagr%   r%   r&   maintain_processor_list   s   



$  <$$,
 
r   c           
      C   s   | | }|D ]Q}|\}}g }|D ]@}t |jt| v |tko!|jdkgsK||i |ji dg }	dd |	D }	|jt|	d}td|	||j |	| qt||d< q|S )a  
    Expand the processor_list as given in maintain_processor_list to have the dependencies

    Still a list of model types to ModelSpecifications
    the dependencies are tuples: name and package
    for example:
    [['pos', (ModelSpecification(processor='pos', package='gsd', dependencies=(('pretrain', 'gsd'),)),)],
     ['depparse', (ModelSpecification(processor='depparse', package='gsd', dependencies=(('pretrain', 'gsd'),)),)]]
    r   r9   c                 S   s   g | ]
}|d  |d fqS )modelr8   r%   r   
dependencyr%   r%   r&   r   ?  s    z$add_dependencies.<locals>.<listcomp>)r9   z/Found dependencies %s for processor %s model %s   )
anyr8   r   r
   rs   _replacer   rL   rM   r   )
r   r   r   lang_resourcesr   r7   model_specsnew_model_specs
model_specr9   r%   r%   r&   add_dependencies*  s    
r   c           	      C   s   g }g }| D ]#}|\}}|D ]}|j }|j}|||g |r(|dd |D 7 }qqdd t|D }|D ]\}}td| d| d q5||7 }|S )z
    The flattened processor list is just a list of types & packages

    For example:
      [['pos', 'gsd'], ['depparse', 'gsd'], ['pretrain', 'gsd']]
    c                 S      g | ]}t |qS r%   )r   r   r%   r%   r&   r   V      z*flatten_processor_list.<locals>.<listcomp>c                 S   r   r%   )r   )r   r   r%   r%   r&   r   W  r   zFind dependency r   r   )r8   r9   r   setrL   rM   )	r   flattened_processor_listdependencies_listr   r7   r   r   r8   r9   r%   r%   r&   flatten_processor_listF  s"   r   c                 C   s   |dkrd} n|dkrd} | d u rt jdkrt d t jS |  } g d}| |vr8td|  dd	| d
t |  t jS )NFERRORTINFOr   )DEBUGr   WARNINGWARNr   CRITICALFATALz)Unrecognized logging level for pipeline: z. Must be one of , r   )rL   rq   setLevelupperra   rQ   )logging_levelr}   
all_levelsr%   r%   r&   set_logging_level]  s(   


r   c                    s  t | tr|   } n| d urtdt| j dt |tr%| }n|d ur4tdt|j dt |tttfrd u rGt	dd n(t trW t	 fddnt t
rdt	dd ntdtj dt |tr~d	d
 |dD }fdd|D }d nt |t
rdd | D }n|d urtdt|j dt tr  nd urtdtj d| ||fS )Nz,The parameter 'lang' should be str, but got z	 instead.z1The parameter 'model_dir' should be str, but got c                   S      dS Nr   r%   r%   r%   r%   r&   <lambda>      z-process_pipeline_parameters.<locals>.<lambda>c                      s    S )Nr%   r%   )r   r%   r&   r     r   c                   S   r   r   r%   r%   r%   r%   r&   r     r   z>The parameter 'package' should be None, str, or dict, but got c                 S      g | ]}|   qS r%   striplower)r   xr%   r%   r&   r         z/process_pipeline_parameters.<locals>.<listcomp>,c                    s   i | ]}| | qS r%   r%   )r   r7   )r8   r%   r&   
<dictcomp>  s    
z/process_pipeline_parameters.<locals>.<dictcomp>c                 S   sB   i | ]\}}|   t|ttfrd d |D n|   qS )c                 S   r   r%   r   )r   v_ir%   r%   r&   r     r   z:process_pipeline_parameters.<locals>.<dictcomp>.<listcomp>)r   r   r   r   r   )r   kvr%   r%   r&   r     s    0z:The parameter 'processors' should be dict or str, but got zSThe parameter 'package' should be str, or a dict if 'processors' is a str, but got )r   r   r   r   	TypeErrortyper(   r   r   r   dictr   r   )r   	model_dirr8   r   r%   )r   r8   r&   process_pipeline_parametersw  sl   








r   c                 C   sn   |t kr|durt| }| dv rt}| d| d}td| |du r-tj| d}t	|||dd dS )	z=
    Downloads resources.json to obtain latest packages.
    N)stanfordstanfordnlpz/resources_z.jsonz!Downloading resource file from %sresources.jsonT)rt   )
DEFAULT_RESOURCES_URLSTANZA_RESOURCES_GITHUBr   STANFORDNLP_RESOURCES_URLrL   rM   rP   r=   rQ   r   )r   resources_urlresources_branchresources_versionr!   rh   r%   r%   r&   download_resources_json  s   	
r   c                 C   sf   |du rt j| d}t j|st|t|dd}t|}W d   |S 1 s,w   Y  |S )zA
    Unpack the resources json file from the given model_dir
    Nr   zutf-8)encoding)rP   r=   rQ   r[   r   r@   jsonload)r   r!   rG   r   r%   r%   r&   load_resources_json  s   
r   c                 C   s8   || vrdS | | }d|v r|d }| | }d|v s|S )zi
    Get the resources for a lang from an already loaded resources json, following 'alias' if needed
    Naliasr%   )r   r   r   r%   r%   r&   get_language_resources  s   r   c                    s:   t | |||d|d t|   fdd D }t|}|S )z<
    List the non-alias languages in the resources file
    Nr!   rh   c                    s*   g | ]}t  | tsd  | vr|qS )r   )r   r   )r   r   r   r%   r&   r     s    z,list_available_languages.<locals>.<listcomp>)r   r   sorted)r   r   r   r   rh   	languagesr%   r   r&   list_available_languages  s
   r   c                 C   s   |  dkr
| d S |S )za
    Returns the url in the resources dict if model_url is default, or returns the model_url
    r   r|   )r   )r   	model_urlr%   r%   r&   expand_model_url  r]   r   c                 C   s  | |i  d|}tddg| }	|rtj}
ntj}
|
d| d| d|	  t||}| D ]P\}}z4t|j||| d| dd	tj	
|||| d||| | | d
 ||| | |  dd d W q. ty~ } ztd| d| d|d }~ww d S )N	lang_name	ProcessorPackagez4Downloading these customized packages for language:  (z)...
rK   z.ptr   r   rC   rE   rb   )rE   r   rb   z@Cannot find the following processor and model name combination: r   z;. Please check if you have provided the correct model name.)rs   r   rL   r   rM   r   r   formatrP   r=   rQ   KeyErrorra   )download_listr   r   r   r   r   rh   r   r   download_tablelog_msgr|   r   r   rI   r%   r%   r&   download_models  sL   

r  enc              
   C   s  t || t| |||\} }}}|stjtj|ds.|s$td t||||d |
d t	|}| |vr:t
| d||  v rVtd|  d||  d  d ||  d } || i d| }t||	}|dkr|d u sst|d	krtd
|  d| d t|j|| ddtj|| d|
||  d d ttj|| d nt|| ||dd}t|| |}t|}t||| |||	|
dd td|  d S )Nr   zZAsked to skip downloading resources.json, but the file does not exist.  Downloading anywayr   r   "z" is an alias for "r   r   r   z+Downloading default packages for language: r   z) ...zdefault.zipr   default_md5)rE   T)r   )r   r   r   r   r   r   rh   r   z)Finished downloading models and saved to )r   r   rP   r=   r[   rQ   rL   r   r   r   r-   r   rs   r   rV   r   r   rT   r   r   r   r  )r   r   r8   r   r   r}   r   r   r   r   rh   download_jsonr   r   r|   r   r%   r%   r&   download+  sP   

 

r  )NN)F)NNFTN)FT)M__doc__collectionsr   r   r_   rD   r   loggingrP   pathlibr   rr   shutilr   rN   	tqdm.autor   stanza.utils.helper_funcr   stanza.pipeline._constantsr   r   r	   r
   r   r   r   stanza.pipeline.registryr   r   !stanza.resources.default_packagesr   stanza._versionr   	getLoggerrL   r   homeHOME_DIRr   r   getenvr   DEFAULT_RESOURCES_VERSIONDEFAULT_MODEL_URLr=   rQ   DEFAULT_MODEL_DIRr   r^   r   ra   r-   r3   r5   r>   rJ   rT   rZ   r\   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r%   r%   r%   r&   <module>   s    $
	



bA

	
(