# stanza/utils/datasets/common.py

import argparse
from enum import Enum
import glob
import logging
import os
import re
import subprocess
import sys

from stanza.models.common.short_name_to_treebank import canonical_treebank_name
import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
import stanza.utils.default_paths as default_paths

logger = logging.getLogger('stanza')

# token ids of the form "4-5" mark multi-word token ranges in a conllu file
MWT_RE = re.compile("^[0-9]+[-][0-9]+")
# token ids of the form "4-5" (multi-word token) or "4.1" (copy / empty node)
MWT_OR_COPY_RE = re.compile("^[0-9]+[-.][0-9]+")
# plain integer ids for regular words
INT_RE = re.compile("^[0-9]+$")

# perl script shipped alongside this module which converts conllu back to raw text
CONLLU_TO_TXT_PERL = os.path.join(os.path.split(__file__)[0], "conllu_to_text.pl")

class ModelType(Enum):
    TOKENIZER = 1
    MWT = 2
    POS = 3
    LEMMA = 4
    DEPPARSE = 5

class UnknownDatasetError(ValueError):
    def __init__(self, dataset, text):
        super().__init__(text)
        self.dataset = dataset
 qdS )z
    Uses the udtools perl script to convert a conllu file to txt

    TODO: switch to a python version to get rid of some perl dependence
    /..gold.conllu.txtz-Cannot convert %s as the file cannot be foundzperl  z > T)shellN)ospathexistsFileNotFoundError
subprocesscheck_outputCONLLU_TO_TXT_PERL)tokenizer_dir
short_nameshardsr   output_conllu
output_txtr   r   r   convert_conllu_to_txt(   s   "r5   c                 C      t j| | d| dS )N-ud-z	-mwt.jsonr)   r*   joinbase_dirr1   r   r   r   r   mwt_name8      r<   c                 C   r6   )Nr$   r%   r8   r:   r   r   r   tokenizer_conllu_name;   r=   r>   c                 C   s<   | d| d| d}t |||}t| |d|d|g d S )Nr#   r7   z
.toklabelsz-oz-m)r<   prepare_tokenizer_datamain)	input_txtinput_conllur0   r1   r   labels_filenamemwt_filenamer   r   r    prepare_tokenizer_dataset_labels>   s   
rE   c              
   C   sx   dD ]7}|  d| d| d}|  d| d| d}z
t ||| || W q ttfy.       td||f   dS )z]

def prepare_tokenizer_treebank_labels(tokenizer_dir, short_name):
    """
    Given the txt and gold.conllu files, prepare mwt and label files for train/dev/test
    """
    for dataset in ("train", "dev", "test"):
        output_txt = f"{tokenizer_dir}/{short_name}.{dataset}.txt"
        output_conllu = f"{tokenizer_dir}/{short_name}.{dataset}.gold.conllu"
        try:
            prepare_tokenizer_dataset_labels(output_txt, output_conllu, tokenizer_dir, short_name, dataset)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            print("Failed to convert %s to %s" % (output_txt, output_conllu))
            raise
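
# Illustrative sketch, not part of the original module: the file layout the
# helpers above expect, for a hypothetical en_ewt tokenizer dataset in a
# hypothetical data/tokenize directory.
def _example_tokenizer_files():
    tokenizer_dir = "data/tokenize"   # hypothetical output directory
    short_name = "en_ewt"             # hypothetical language_dataset short name
    print(tokenizer_conllu_name(tokenizer_dir, short_name, "train"))  # data/tokenize/en_ewt.train.gold.conllu
    print(mwt_name(tokenizer_dir, short_name, "train"))               # data/tokenize/en_ewt-ud-train-mwt.json
    # once the .txt and .gold.conllu files exist, the label files can be built:
    # prepare_tokenizer_treebank_labels(tokenizer_dir, short_name)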

def read_sentences_from_conllu(filename):
    """
    Reads a conllu file as a list of list of strings

    Finding a blank line separates the lists
    """
    sents = []
    cache = []
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            if len(line) == 0:
                if len(cache) > 0:
                    sents.append(cache)
                    cache = []
                continue
            cache.append(line)
        if len(cache) > 0:
            sents.append(cache)
    return sents

def maybe_add_fake_dependencies(lines):
    """
    Possibly add fake dependencies in columns 6 and 7 (counting from 0)

    The conllu scripts need the dependencies column filled out, so in
    the case of models we build without dependency data, we need to
    add those fake dependencies in order to use the eval script etc

    lines: a list of strings with 10 tab separated columns
      comments are allowed (they will be skipped)

    returns: the same strings, but with fake dependencies added
      if columns 6 and 7 were empty
    """
    new_lines = []
    root_idx = None
    first_idx = None
    for line_idx, line in enumerate(lines):
        if line.startswith("#"):
            new_lines.append(line)
            continue

        pieces = line.split("\t")
        if MWT_OR_COPY_RE.match(pieces[0]):
            new_lines.append(line)
            continue

        token_idx = int(pieces[0])
        if pieces[6] != '_':
            if pieces[6] == '0':
                root_idx = token_idx
            new_lines.append(line)
        elif token_idx == 1:
            # keep the first word as a list of pieces so it can be patched
            # below, once we know whether the sentence already had a root
            first_idx = line_idx
            new_lines.append(pieces)
        else:
            pieces[6] = "1"
            pieces[7] = "dep"
            new_lines.append("\t".join(pieces))

    if first_idx is not None:
        if root_idx is None:
            new_lines[first_idx][6] = "0"
            new_lines[first_idx][7] = "root"
        else:
            new_lines[first_idx][6] = str(root_idx)
            new_lines[first_idx][7] = "dep"
        new_lines[first_idx] = "\t".join(new_lines[first_idx])

    return new_lines

def write_sentences_to_file(outfile, sents):
    for lines in sents:
        lines = maybe_add_fake_dependencies(lines)
        for line in lines:
            print(line, file=outfile)
        print("", file=outfile)

def write_sentences_to_conllu(filename, sents):
    with open(filename, "w", encoding="utf-8") as outfile:
        write_sentences_to_file(outfile, sents)
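
# Illustrative sketch, not part of the original module: what
# maybe_add_fake_dependencies does to a made-up two word sentence whose
# HEAD and DEPREL columns are empty.
def _example_fake_dependencies():
    lines = ["# text = Hi there",
             "1\tHi\thi\tINTJ\t_\t_\t_\t_\t_\t_",
             "2\tthere\tthere\tADV\t_\t_\t_\t_\t_\t_"]
    for line in maybe_add_fake_dependencies(lines):
        print(line)
    # word 1 becomes the root (HEAD=0, DEPREL=root) and word 2 is attached
    # to it with the generic "dep" relation (HEAD=1)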

def find_treebank_dataset_file(treebank, udbase_dir, dataset, extension, fail=False, env_var="UDBASE"):
    """
    For a given treebank, dataset, extension, look for the exact filename to use.

    Sometimes the short name we use is different from the short name
    used by UD.  For example, Norwegian or Chinese.  Hence the reason
    to not hardcode it based on treebank

    set fail=True to fail if the file is not found
    """
    if treebank.startswith("UD_Korean") and treebank.endswith("_seg"):
        treebank = treebank[:-4]
    filename = os.path.join(udbase_dir, treebank, f"*-ud-{dataset}.{extension}")
    files = glob.glob(filename)
    if len(files) == 0:
        if fail:
            raise FileNotFoundError("Could not find any treebank files which matched {}\nIf you have the data elsewhere, you can change the base directory for the search by changing the {} environment variable".format(filename, env_var))
        return None
    if len(files) == 1:
        return files[0]
    raise RuntimeError(f"Unexpected number of files matched '{udbase_dir}/{treebank}/*-ud-{dataset}.{extension}'")

def mostly_underscores(filename):
    """
    Certain treebanks have proprietary data, so the text is hidden

    For example:
      UD_Arabic-NYUAD
      UD_English-ESL
      UD_English-GUMReddit
      UD_Hindi_English-HIENCS
      UD_Japanese-BCCWJ
    """
    underscore_count = 0
    total_count = 0
    for line in open(filename).readlines():
        line = line.strip()
        if not line:
            continue
        if line.startswith("#"):
            continue
        total_count = total_count + 1
        pieces = line.split("\t")
        if pieces[1] in ("_", "-"):
            underscore_count = underscore_count + 1
    return underscore_count / total_count > 0.5
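
# Illustrative sketch, not part of the original module: locating a treebank's
# train file and checking whether its text is usable.  UD_English-EWT is just
# an example; $UDBASE must point at a UD release for this to find anything.
def _example_find_train_file():
    paths = default_paths.get_default_paths()
    train_file = find_treebank_dataset_file("UD_English-EWT", paths["UDBASE"], "train", "conllu", fail=True)
    if mostly_underscores(train_file):
        print("%s hides its text behind underscores" % train_file)
    else:
        print("found %s" % train_file)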

def num_words_in_file(conllu_file):
    """
    Count the number of non-blank lines in a conllu file
    """
    count = 0
    with open(conllu_file) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            if line.startswith("#"):
                continue
            count = count + 1
    return count
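
# Illustrative sketch, not part of the original module: reporting the size of
# each shard of a converted dataset.  The directory and short name are made up.
def _example_shard_sizes(tokenizer_dir="data/tokenize", short_name="en_ewt"):
    for dataset in ("train", "dev", "test"):
        conllu_file = tokenizer_conllu_name(tokenizer_dir, short_name, dataset)
        if os.path.exists(conllu_file):
            print("%s: %d words" % (dataset, num_words_in_file(conllu_file)))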

def get_ud_treebanks(udbase_dir, filtered=True):
    """
    Looks in udbase_dir for all the treebanks which have both train, dev, and test
    """
    treebanks = sorted(glob.glob(udbase_dir + "/UD_*"))
    treebanks = [os.path.split(t)[1] for t in treebanks]
    # GUMReddit ships with its text removed, so it is skipped here
    treebanks = [t for t in treebanks if t != "UD_English-GUMReddit"]
    if filtered:
        # keep treebanks which have both train and test conllu files
        treebanks = [t for t in treebanks
                     if (find_treebank_dataset_file(t, udbase_dir, "train", "conllu") and
                         find_treebank_dataset_file(t, udbase_dir, "test", "conllu"))]
        # drop treebanks whose text is hidden behind underscores
        treebanks = [t for t in treebanks
                     if not mostly_underscores(find_treebank_dataset_file(t, udbase_dir, "train", "conllu"))]
        # drop treebanks with no dev set unless train and test are large enough;
        # the exact word count thresholds are not recoverable from the compiled
        # module, so the values below are assumptions
        treebanks = [t for t in treebanks
                     if (find_treebank_dataset_file(t, udbase_dir, "dev", "conllu") or
                         (num_words_in_file(find_treebank_dataset_file(t, udbase_dir, "train", "conllu")) > 5000 and
                          num_words_in_file(find_treebank_dataset_file(t, udbase_dir, "test", "conllu")) > 2000))]
    return treebanks

def build_argparse():
    parser = argparse.ArgumentParser()
    parser.add_argument("treebanks", type=str, nargs="+", help="Which treebanks to run on.  Use all_ud or ud_all for all UD treebanks")
    return parser

def main(process_treebank, model_type, add_specific_args=None):
    logger.info("Datasets program called with:\n" + " ".join(sys.argv))

    parser = build_argparse()
    if add_specific_args is not None:
        add_specific_args(parser)
    args = parser.parse_args()

    paths = default_paths.get_default_paths()

    treebanks = []
    for treebank in args.treebanks:
        if treebank.lower() in ("ud_all", "all_ud"):
            ud_treebanks = get_ud_treebanks(paths["UDBASE"])
            treebanks.extend(ud_treebanks)
        else:
            treebank = canonical_treebank_name(treebank)
            treebanks.append(treebank)

    for treebank in treebanks:
        process_treebank(treebank, model_type, paths, args)
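
# Illustrative sketch, not part of the original module: how a dataset
# preparation script typically drives main().  process_my_treebank and the
# --augment flag are made-up names; real preparation scripts supply their own
# process function and extra arguments.
def _example_entry_point():
    def add_specific_args(parser):
        parser.add_argument("--augment", action="store_true", help="hypothetical extra flag")

    def process_my_treebank(treebank, model_type, paths, args):
        print("would prepare %s for %s using UDBASE=%s" % (treebank, model_type, paths["UDBASE"]))

    # reads the treebank list from sys.argv, e.g.  python script.py UD_English-EWT
    main(process_my_treebank, ModelType.TOKENIZER, add_specific_args)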