o
    h                     @   s   d dl mZ d dlZd dlZd dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ e ZG dd deZdd	 Zd
d Zdd Zdd Zdd ZedZedZdd ZdS )    )EnumN)tree_reader)write_dataset)get_tqdmc                   @   s   e Zd ZdZdZdZdS )Version         N)__name__
__module____qualname__V51V51bV90 r   r   i/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/convert_ctb.pyr      s    r   c                 C   s   | dkr
| dkr
dS | dkr| dkrdS | dkr| dkrdS | dkr(| d	kr(dS | d
kr2| dkr2dS | dkr<| dkr<dS t d|  )Nr   i/  r     ip  iv    |    i0  iu  r   iq  {  Unhandled filenum %d
ValueErrorfilenumr   r   r   filenum_to_shard_51   s   r   c                 C   sp   | dkr
| dkr
dS | dkr| dkrdS | dkr| dkrdS | dkr(| d	kr(d
S | dkr2| dkr2d S t d|  )Nr   i  r   i  r   i-  iE  i  ,  r   i  i  r   r   r   r   r   r   filenum_to_shard_51_basic%   s   r   c                 C   s  | dkr
| dkr
dS | dkr| dkrdS | dv rdS | dkr$| dkr$dS | d	kr.| d
kr.dS | dkr8| dkr8dS | dkrB| dkrBdS | dkrL| dkrLdS | dkrV| dkrVdS | dkr`| dkr`dS | dkrj| dkrjdS | dkrt| dkrtdS | dkr~| dkr~dS | dkr| dkrdS | dkr| dkrdS | dkr| dkrd S | d!kr| d"krd S | d#v rd S | d$kr| d%krd S | d&kr| d'krd S | d(kr| d)krd S | d*kr| d+krd S | d,kr| d-krd S | d.kr| d/krd S | d0kr| d1krd S | d2kr| d3krd S | d4kr| d5krd S | d6kr| d7krd S | d8kr | d9kr d S | d:kr,| d;kr,d S | d<kr8| d=kr8d S | d>krD| d?krFd S d S d S )@Nr   (   r   i  r   )i  i  i  i  i$  i%  i0  i^  i_  il  iu  iv  r   iu  i  i  i	  i

  i*
  i
  i  i&  iI  )   P   i`  ii  i\  io  i  i  i	  i	
  i
  i
  i  i%  Q   r   r   i  )i  ij  ik  i  i  i  i  i  i#  i&  i/  i1  i]  im  it  iw  r   i}  i[  ip  it  i  i  i	  i	  i+
  i
  i  i    i  r   r   r   r   r   filenum_to_shard_906   s   r%   c                 c   s@    | j dkr| j| jd fV  | D ]}t|D ]}|V  qqd S )NSID)tagtextattribcollect_trees_srootchildtreer   r   r   r+   |   s   
r+   c                 c   st    | j dkrt| j dkr| jd fV  | j dkr)t| j dkr)| jd fV  | D ]}t|D ]}|V  q1q+d S )NTEXTr   TURN)r(   lenr)   stripcollect_trees_textr,   r   r   r   r4      s   r4   z<S ID=([0-9a-z]+)>z<(su|msg) id=([0-9a-zA-Z_=]+)>c                    s  t  tj| d}g g g g}g }|D ] }tj|d }ttj|d dd  | |f q|  t	|D ]\ }	|t
jt
jfv ret|	ddd}
|
 }W d    n1 s_w   Y  n|t
ju rt|	dd	}
|
 }W d    n1 sw   Y  |d
dkr|ddk r|d
d} dv r|dddd} dkrۈ dkr|ddkr|dddd}n|ddk rd| }n
|dddd}d| } dkr dks d kr d!ks d"kr d#krtd|} d$v r
|d%d&d'd(}d)| }ntd*| td+|}|d,d-}zt|}W n tyD } zt|d d.  td/|	 |d }~ww d0d1 t|D }|t
ju rdt|dkrdd2d1 t|D }|t
jt
jfv rw fd3d1|D }nd4d1 |D }d5|}z	tj|d6d7}W n ty } zt|d d8  td9|	 d }~ww d:d1 |D }t|dksJ d;|	 |t
ju rt  }n|t
ju rt! }nt" }|d u rq<|| #| q<t$||| d S )<N*r   r   _ignoregb2312)errorsencodingzutf-8)r:   z<TURN>z</TURN> )im  ip  i  z<)z&lt;)z>)z&gt;)r#   i;  z<segmentz<segment id=z<S ID=z
</segment>z</S>z<segz<TEXT>
%s</TEXT>
z<seg id=z</seg>z<foo>
%s</foo>
i  i  ip  i,  iX  r$   )i  i5  <z&lt;>z&gt;z<foo><TEXT>
%s</TEXT></foo>
zUnknown CTB version %sz<S ID="\1">&z&amp;i  zCannot xml process %sc                 S      g | ]}|qS r   r   .0xr   r   r   
<listcomp>       zconvert_ctb.<locals>.<listcomp>c                 S   r?   r   r   r@   r   r   r   rC      rD   c                    s(   g | ]} d ks|d dkr|d qS )i  r   4366r   r   r@   r   r   r   rC      s   ( c                 S   s   g | ]}|d  qS )r   r   r@   r   r   r   rC      s    
F)use_tqdmr   z%Could not process the tree text in %sc                 S   s   g | ]}|   qS r   )
prune_nonesimplify_labels)rA   tr   r   r   rC      s    zNo trees in %s)%globospathjoinsplitintsplitextappendsorttqdmr   r   r   openreadr   findreplacesu_resubr   id_reET
fromstring	ExceptionprintRuntimeErrorr+   r2   r4   r   
read_treesr   r   r%   extendr   )	input_dir
output_dirdataset_nameversioninput_filesdatasetssorted_filenamesinput_filenamebase_filenamefilenamefinr)   xml_rootetreesshardr   r   r   convert_ctb   s   



4




rr   )enumr   rK   rL   rexml.etree.ElementTreeetreeElementTreer\   stanza.models.constituencyr   (stanza.utils.datasets.constituency.utilsr   stanza.utils.get_tqdmr   rT   r   r   r   r%   r+   r4   compiler[   rY   rr   r   r   r   r   <module>   s$    F

