o
    –h¥  ã                   @   sZ   d dl m  mZ d dlmZ d dlmZ dd„ Zddd	„Z	d
d„ Z
edkr+e
ƒ  dS dS )é    N)Útree_reader)Úutilsc                 C   s&  t | dd}t |¡}W d  ƒ n1 sw   Y  | ¡ }| d¡}|s*tdƒ‚g }|D ]b}|jdkr=td |j¡ƒ‚d}d}d}|D ]"}	|	jdkrO|	}qE|	jd	krW|	}qE|	jd
kr_|	}
qEtd ||	j¡ƒ‚|du st|du st|
du r{td |¡ƒ‚d | 	¡ ¡}d |
 	¡ ¡}| 
||f¡ q.|S )z\
    Convert the CINTIL xml file to id & test

    Returns a list of tuples: (id, text)
    zutf-8)ÚencodingNz{http://nlx.di.fc.ul.pt}corpusz*Unexpected dataset structure : no 'corpus'z {http://nlx.di.fc.ul.pt}sentencezUnexpected sentence tag: {}z{http://nlx.di.fc.ul.pt}idz{http://nlx.di.fc.ul.pt}rawz{http://nlx.di.fc.ul.pt}treez!Unexpected tag in sentence {}: {}zMissing node in sentence {}Ú )ÚopenÚETÚparseÚgetrootÚfindÚ
ValueErrorÚtagÚformatÚjoinÚitertextÚappend)Úinput_filenameÚfinÚdatasetÚcorpusÚtreesÚsentenceÚid_nodeÚraw_nodeÚ
tree_noddeÚnodeÚ	tree_nodeÚtree_idÚ	tree_text© r   úl/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/convert_cintil.pyÚread_xml_file   s8   ÿ




r    çš™™™™™é?çš™™™™™¹?c                 C   sD  t | ƒ}g }g }|D ]S\}}| d¡dkrtdƒ‚| dd¡}| dd¡}d| }t |¡}t|ƒd	kr>td
t|ƒ|f ƒ‚|d }| d¡rM| |¡ q
| d¡dkrXtdƒ‚| |¡ q
t	dt|ƒ ƒ t	dt|ƒ ƒ t
 |||¡\}	}
}t	dt|ƒt|	ƒt|
ƒt|ƒf ƒ ||	 }	t	dt|	ƒt|
ƒt|ƒf ƒ |	|
|fS )z8
    dev_size is the size for splitting train & dev
    z _r   zUnexpected underscorez_)ú)z(A (z(A' (z	(ROOT %s)é   z!Unexpectedly found %d trees in %sÚaTSTSÚTSTSzUnexpected TSTSzRead %d synthetic treeszRead %d natural treesz+Split %d trees into %d train %d dev %d testz%Total lengths %d train %d dev %d test)r    r
   r   Úreplacer   Ú
read_treesÚlenÚ
startswithr   Úprintr   Úsplit_treebank)r   Ú
train_sizeÚdev_sizer   Úsynthetic_treesÚnatural_treesr   r   ÚtreeÚtrain_treesÚ	dev_treesÚ
test_treesr   r   r   Úconvert_cintil_treebank)   s2   

$
r5   c                  C   s   t dƒ} d S )Nz>extern_data/constituency/portuguese/CINTIL/CINTIL-Treebank.xml)r5   )Útreebankr   r   r   ÚmainL   s   r7   Ú__main__)r!   r"   )Úxml.etree.ElementTreeÚetreeÚElementTreer   Ústanza.models.constituencyr   Ú"stanza.utils.datasets.constituencyr   r    r5   r7   Ú__name__r   r   r   r   Ú<module>   s    
##
ÿ