o
    hu$                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZmZ G dd deZG d	d
 d
ZdS )zI
Utility functions for the loading and conversion of CoNLL-format files.
    N)ZipFile)Document)IDTEXTLEMMAUPOSXPOSFEATSHEADDEPRELDEPSMISCNER
START_CHAREND_CHAR)FIELD_TO_IDX	FIELD_NUMc                   @   s   e Zd ZdS )
CoNLLErrorN)__name__
__module____qualname__ r   r   M/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/conll.pyr      s    r   c                   @   s   e Zd ZedddZedd Zedd Zedd	 ZedddZedddZ	edddZ
edd ZedddZd
S )CoNLLTc           	      C   s  g g }}g g }}t | D ]a\}}| d}t|dkr4t|dkr3|| g }|| g }q|dr?|| q|d}|rMd|d v rMqt|tkrjtd|d  dt d	t| d
| d| 
||g7 }qt|dkr|| || ||fS )a   Load the file or string into the CoNLL-U format data.
        Input: file or string reader, where the data is in CoNLL-U format.
        Output: a tuple whose first element is a list of list of list for each token in each sentence in the data,
        where the innermost list represents all fields of a token; and whose second element is a list of lists for each
        comment in each sentence in the data.
        z 
	r   #	.zCannot parse CoNLL line    z: expecting z	 fields, z found at line z
  )		enumeratelstriprstriplenappend
startswithsplitr   r   )	fignore_gappingdocsentdoc_commentssent_commentsline_idxlinearrayr   r   r   
load_conll   s0   
	





.

zCoNLL.load_conllc                 C   s   g }g }t | D ]p\}}g }g }t |D ]Y\}}zt|}	W n ty7 }
 ztd||t|
f |
d}
~
ww d|	t v rVtdd |	t jdddD |	t< |	|	 qtdd |	t jd	ddD |	t< |	|	 q|	| |	| q||fS )
ai   Convert the CoNLL-U format input data to a dictionary format output data.
        Input: list of token fields loaded from the CoNLL-U format data, where the outmost list represents a list of sentences, and the inside list represents all fields of a token.
        Output: a list of list of dictionaries for each token in each sentence in the document.
        z*Could not process sentence %d token %d: %sNr   c                 s       | ]}t |V  qd S Nint.0xr   r   r   	<genexpr>E       z&CoNLL.convert_conll.<locals>.<genexpr>r   maxsplitc                 s   r/   r0   r1   r3   r   r   r   r6   H   r7   -)
r   r   convert_conll_token
ValueErrorr   strr   tupler$   r"   )	doc_conlldoc_dict	doc_emptysent_idx
sent_conll	sent_dict
sent_empty	token_idxtoken_conll
token_dicter   r   r   convert_conll4   s(   $$
zCoNLL.convert_conllc                 C   s.   t | }d|}|d}dd |D }|S )a   Convert the dictionary format input data to the CoNLL-U format output data.

        This is the reverse function of `convert_conll`, but does not include sentence level annotations or comments.

        Can call this on a Document using `CoNLL.convert_dict(doc.to_dict())`

        Input: dictionary format data, which is a list of list of dictionaries for each token in each sentence in the data.
        Output: CoNLL-U format data as a list of list of list for each token in each sentence in the data.
        z{:c}z

c                 S   s    g | ]}d d | dD qS )c                 S   s   g | ]}| d qS )r   r$   r3   r   r   r   
<listcomp>\   s    z1CoNLL.convert_dict.<locals>.<listcomp>.<listcomp>
rK   )r4   sentencer   r   r   rL   \   s     z&CoNLL.convert_dict.<locals>.<listcomp>)r   formatr$   )r@   r'   text	sentencesr?   r   r   r   convert_dictN   s
   

zCoNLL.convert_dictc                 C   st   i }t D ]3}| t |  }|dkr|tkrt|||< n|||< | t t  dkr7| t t  |t< | t t  |t< q|S )z Convert the CoNLL-U format input token to the dictionary format output token.
        Input: a list of all CoNLL-U fields for the token.
        Output: a dictionary that maps from field name to value.
        _)r   r
   r2   r   r   )rG   rH   fieldvaluer   r   r   r;   _   s   zCoNLL.convert_conll_tokenNc              	   C   s  t | |grt| |grJ d|r| sJ d|r(t|}t||\}}nW|rat|*}|| }ttj|dd|\}}W d   n1 sLw   Y  W d   n1 s[w   Y  nt| dd}t||\}}W d   n1 szw   Y  t	|\}	}
|	||
fS )zV Load the CoNLL-U format data from file or string into lists of dictionaries.
        z%either use input file or input stringz*must provide input_file if zip_file is setutf-8encodingN)
anyallioStringIOr   r.   r   openTextIOWrapperrJ   )
input_file	input_strr&   zip_fileinfiler?   r)   zinfinr@   rA   r   r   r   
conll2dicts   s$    


zCoNLL.conll2dictc                 C   s(   t j| |||d\}}}t|d ||dS )Nra   rP   commentsempty_sentences)r   re   r   )r_   r`   r&   ra   r@   r)   rA   r   r   r   	conll2doc   s   zCoNLL.conll2docc                 C   sN  t j| |||d\}}}g }g }g }	g }
d }t|||D ]d\}}}|D ]M}|ds/|drp|jdddd }t|dkrA|}n$||krdt|d |	|
d}|d kr\|jD ]}| |_	qT|
| |}nq#|g}|g}	|g}
 nq#|
| |	
| |

| qt|dkrt|d |	|
d}|d kr|jD ]}| |_	q|
| |}|S )	Nrf   z
# doc_id =z# newdoc id ==r   r8   r   rg   )r   re   zipr#   r$   r!   r   rQ   stripdoc_idr"   )r_   r`   r&   ra   r@   r)   rA   docscurrent_doccurrent_commentscurrent_emptycurrent_doc_idr'   rh   emptycommentrn   new_docir   r   r   conll2multi_docs   sJ   






zCoNLL.conll2multi_docsc                 C   s   t | }t|| dS )zq
        Convert the dictionary format input data to the CoNLL-U format output data and write to a file.
        N)r   r   write_doc2conll)r@   filenamer'   r   r   r   
dict2conll   s   zCoNLL.dict2conllwrV   c                 C   sb   t |dr|d|  dS t|||d}|d|  W d   dS 1 s*w   Y  dS )a   
        Writes the doc as a conll file to the given file.

        If passed a string, that filename will be opened.  Otherwise, filename.write() will be called.

        Note that the output needs an extra 

 at the end to be a legal output file
        writez{:C}

rW   N)hasattrr}   rO   r]   )r'   rz   moderX   outfiler   r   r   ry      s
   
	"zCoNLL.write_doc2conll)T)NNTN)r|   rV   )r   r   r   staticmethodr.   rJ   rR   r;   re   rj   rx   r{   ry   r   r   r   r   r      s&    "


)
r   )__doc__osr[   zipfiler   stanza.models.common.docr   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r<   r   r   r   r   r   r   <module>   s    <