o
    h#                     @   s.  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 e	 Z
dZdZedZG d	d
 d
eZG dd deZG dd deZG dd deZdd Zdd ZedZG dd dZG dd deZG dd deZdd Zd+d d!Zd,d"d#Zd,d$d%Zd-d&d'Zd(d) Z e!d*kre   dS dS ).z
Reads ParseTree objects from a file, string, or similar input

Works by first splitting the input into (, ), and all other tokens,
then recursively processing those tokens into trees.
    )dequeN)Tree)get_tqdm()zstanza.constituencyc                           e Zd ZdZ fddZ  ZS )UnclosedTreeErrorz!
    A tree looked like (Foo
    c                       t  d|  || _d S )NzKFound an unfinished tree (missing close brackets).  Tree started on line %dsuper__init__line_numselfr   	__class__ a/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/constituency/tree_reader.pyr         
zUnclosedTreeError.__init____name__
__module____qualname____doc__r   __classcell__r   r   r   r   r          r   c                       r   )ExtraCloseTreeErrorz#
    A tree looked like (Foo))
    c                    r	   )NzDFound a broken tree (extra close brackets).  Tree started on line %dr
   r   r   r   r   r   &   r   zExtraCloseTreeError.__init__r   r   r   r   r   r   "   r   r   c                       r   )UnlabeledTreeErrorz
    A tree had no label, such as ((Foo) (Bar))

    This does not actually happen at the root, btw, as ROOT is silently added
    c                    r	   )Nz5Found a tree with no label on a node!  Line number %dr
   r   r   r   r   r   0   r   zUnlabeledTreeError.__init__r   r   r   r   r   r   *   s    r   c                       r   )MixedTreeErrorzB
    Leaf and constituent children are mixed in the same node
    c                    s,   t  d||| || _|| _|| _d S )NziFound a tree with both text children and bracketed children!  Line number {}  Child label {}  Children {})r   r   formatr   child_labelchildren)r   r   r    r!   r   r   r   r   8   s   
zMixedTreeError.__init__r   r   r   r   r   r   4   r   r   c                 C   s   |  dd ddS )Nz-LRB-r   z-RRB-r   )replace)textr   r   r   	normalize>   s   r$   c                 C   sh  t  }|g  t  }|g  t| d}|   |dur|tkr,|g  |g  ny|tkr| }| }|rd| }t	|dkrOt
|d |}n.|d }	d|dd }
|rt|rmt
|	|t
t|
g }nt| j|
|t
|	t
t|
}|s|S n|st
d|S |rt
d|}nt| j|d | n|d | t| d}|dust|  )z<
    Build a tree from the tokens in the token_iterator
    N    r   ROOT)r   appendnextset_mark
OPEN_PARENCLOSE_PARENpopjoinsplitlenr   r$   r   r   r   r   get_mark)token_iterator	broken_okchildren_stack
text_stacktokenr#   r!   pieceschildlabelr    r   r   r   read_single_treeA   sJ   






%r;   z([()])c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )TokenIteratorz
    A specific iterator for reading trees from a tree file

    The idea is that this will keep track of which line
    we are processing, so that an error can be logged
    from the correct line
    c                 C   s   t g | _d| _d | _d S )Nr(   )iterr3   r   markr   r   r   r   r      s   

zTokenIterator.__init__c                 C   s   | j | _dS )z`
        The mark is used for determining where the start of a tree occurs for an error
        N)r   r>   r?   r   r   r   r+      s   zTokenIterator.set_markc                 C   s   | j d u r	td| j S )NzNo mark set!)r>   
ValueErrorr?   r   r   r   r2      s   
zTokenIterator.get_markc                 C   s   | S Nr   r?   r   r   r   __iter__   s   zTokenIterator.__iter__c                 C   s   t | jd }|d u rD| jd | _t | j}|d u rt| }|s"qt|}dd |D }dd |D }t|| _t | jd }|d u s
|S )Nr&   c                 S   s   g | ]}|  qS r   )strip.0xr   r   r   
<listcomp>       z*TokenIterator.__next__.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r   r   rD   r   r   r   rG      rH   )	r*   r3   r   line_iteratorStopIterationrC   LINE_SPLIT_REr0   r=   )r   nliner8   r   r   r   __next__   s    


zTokenIterator.__next__N)	r   r   r   r   r   r+   r2   rB   rN   r   r   r   r   r<   x   s    r<   c                       s   e Zd Zd fdd	Z  ZS )TextTokenIteratorTc                    sT   t    |d| _t| j| _| jdkr"|r"tt| j| _d S t| j| _d S )N
  )	r   r   r0   linesr1   	num_linesr=   tqdmrI   )r   r#   use_tqdmr   r   r   r      s   
zTextTokenIterator.__init__)T)r   r   r   r   r   r   r   r   r   rO      s    rO   c                       s,   e Zd Z fddZdd Zdd Z  ZS )FileTokenIteratorc                    s   t    || _d S rA   )r   r   filename)r   rW   r   r   r   r      s   

zFileTokenIterator.__init__c                 C   sx   t | j}tdd |D }W d    n1 sw   Y  t | j| _|dkr4tt| j|d| _| S t| j| _| S )Nc                 s   s    | ]}d V  qdS )r&   Nr   )rE   _r   r   r   	<genexpr>   s    z.FileTokenIterator.__enter__.<locals>.<genexpr>rQ   )total)openrW   sumfile_objr=   rT   rI   )r   finrS   r   r   r   	__enter__   s   zFileTokenIterator.__enter__c                 C   s   | j r
| j   d S d S rA   )r]   close)r   exc_type	exc_valueexc_tbr   r   r   __exit__   s   zFileTokenIterator.__exit__)r   r   r   r   r_   rd   r   r   r   r   r   rV      s    rV   c                 C   s   g }t | d }|rM|tkr;t| |d}|d u rtd| j |d ur0||}|d ur/|| n|| t | d }n|tkrDt| jtd| j |s	|S )N)r4   z8Tree reader somehow created a None tree!  Line number %dz5Tree document had text between trees!  Line number %d)r*   r,   r;   r@   r   r)   r-   r   )r3   r4   tree_callbacktreesr7   	next_treetransformedr   r   r   read_token_iterator   s&   



ri   FTc                 C   s   t | |}t|||dS )zl
    Reads multiple trees from the text

    TODO: some of the error cases we hit can be recovered from
    r4   re   )rO   ri   )r#   r4   re   rU   r3   r   r   r   
read_trees   s   
rk   c                 C   s<   t | }t|||d}W d   |S 1 sw   Y  |S )z1
    Read all of the trees in the given file
    rj   N)rV   ri   )rW   r4   re   r3   rf   r   r   r   read_tree_file   s   

rl   c                 C   s<   g }t t| D ]}tj| |}|t||| q	|S )zB
    Read all of the trees in all of the files in a directory
    )sortedoslistdirpathr/   extendrl   )dirnamer4   re   rf   rW   	full_namer   r   r   read_directory   s
   rt   c                 C   s\   t d|  t| |d}dd |D }dd |D }t|dkr,tdt||d |S )zZ
    Read a treebank and alter the trees to be a simpler format for learning to parse
    zReading trees from %s)re   c                 S   s   g | ]}|   qS r   )
prune_nonesimplify_labelsrE   tr   r   r   rG     s    z!read_treebank.<locals>.<listcomp>c                 S   s   g | ]}t |jd kr|qS )r&   )r1   r!   rw   r   r   r   rG     s    r   zWFound {} tree(s) which had non-unary transitions at the ROOT.  First illegal tree: {:P})loggerinforl   r1   r@   r   )rW   re   rf   illegal_treesr   r   r   read_treebank   s   r|   c                  C   s   d} t | }t| dS )z
    Reads a sample tree
    z[( (SBARQ (WHNP (WP Who)) (SQ (VP (VBZ sits) (PP (IN in) (NP (DT this) (NN seat))))) (. ?)))N)rk   print)r#   rf   r   r   r   main	  s   r~   __main__)FNT)FNrA   )"r   collectionsr   loggingrn   re%stanza.models.constituency.parse_treer   stanza.utils.get_tqdmr   rT   r,   r-   	getLoggerry   r@   r   r   r   r   r$   r;   compilerK   r<   rO   rV   ri   rk   rl   rt   r|   r~   r   r   r   r   r   <module>   s<    



4/

	



