o
    hf                     @   s(   d Z dd Zdd Zdd Zdd Zd	S )
aM  
Read files of parses and the files which define the train/dev/test splits

Write out the files after splitting them

Sequence of operations:
  - read the raw lines from the input files
  - read the recommended splits, as per the ALT description page
  - separate the trees using the recommended split files
  - write back the trees
c                 C   s   t | dd}| }W d   n1 sw   Y  dd |D }dd |D }tdd |D r9td	| tf td
d |D }|S )z
    Read a split file for ALT

    The format of the file is expected to be a list of lines such as
    URL.1234    <url>
    Here, we only care about the id

    return: a set of the ids
    utf-8encodingNc                 S      g | ]}|  qS  strip.0xr   r   i/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/convert_alt.py
<listcomp>       z#read_split_file.<locals>.<listcomp>c                 S   s   g | ]
}|r|  d  qS )    )splitr   r   r   r   r      s    c                 s   s    | ]	}| d  V  qdS )zURL.N)
startswithr   r   r   r   	<genexpr>   s    z"read_split_file.<locals>.<genexpr>zUnexpected line in %s: %sc                 s   s$    | ]}t |d dd V  qdS ).   N)intr   r   r   r   r   r      s   " )open	readlinesany
ValueErrorr
   set)
split_filefinlinesr   r   r   r   read_split_file   s   

r   c                 C   sx   dd |D }| D ]0}|j dd\}}t| ddd }t|D ]\}}||v r2|| |  nq!td| q	|S )z
    Splits lines of the form
    SNT.17873.4049	(S ...
    then assigns them to a list based on the file id in
    SNT.<file>.<sent>
    c                 S   s   g | ]}t  qS r   )list)r	   _r   r   r   r   '   s    zsplit_trees.<locals>.<listcomp>r   )maxsplitr      z/Couldn't find which split this line goes in:
%s)r   r   	enumerateappendr   )	all_linessplitstreeslinetree_id	tree_text	split_idxr   r   r   r   split_trees    s   r+   c              	   C   s   g }| D ]}t |dd}||  W d   n1 sw   Y  qdd |D }dd |D }t|}dd |D }t|}||k rOtd||   |}d	d |D }t|}||k rhtd
||   |}|S )a  
    Read the trees from the given file(s)

    Any trees with wide spaces are eliminated.  The parse tree
    handling doesn't handle it well and the tokenizer won't produce
    tokens which are entirely wide spaces anyway

    The tree lines are not processed into trees, though
    r   r   Nc                 S   r   r   r   r   r   r   r   r   B   r   z"read_alt_lines.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r   r   r   r   r   r   r   C   r   c                 S      g | ]}d |vr|qS )u   　r   r   r   r   r   r   F       z0Eliminated %d trees for having wide spaces in itc                 S   r,   )z\xr   r   r   r   r   r   K   r-   z3Eliminated %d trees for not being correctly encoded)r   extendr   lenprint)input_filesr$   
input_filer   original_count	new_countr   r   r   read_alt_lines4   s(   
r5   c           
   	   C   s   t | }dd |D }t||}t||D ]2\}}tdt||f  t|ddd}|D ]
}	|d|	 q-W d   n1 sBw   Y  qdS )	z
    Convert the ALT treebank into train/dev/test splits

    input_files: paths to read trees
    split_files: recommended splits from the ALT page
    output_files: where to write train/dev/test
    c                 S   s   g | ]}t |qS r   )r   )r	   r   r   r   r   r   \   r   zconvert_alt.<locals>.<listcomp>zWriting %d trees to %swr   r   z
(ROOT {})
N)r5   r+   zipr0   r/   r   writeformat)
r1   split_filesoutput_filesr$   r%   r&   chunkoutput_filefouttreer   r   r   convert_altR   s   
r@   N)__doc__r   r+   r5   r@   r   r   r   r   <module>   s
    