o
    h                     @   s   d Z ddlZddlZddlZddlmZmZ ddlmZm	Z	m
Z
mZmZ ddlmZ dZdd Zdd
dZd	dddZG dd deZdd Zdd ZedkrUe  dS dS )a+  Invokes the Java semgrex on a document

The server client has a method "semgrex" which sends text to Java
CoreNLP for processing with a semgrex (SEMantic GRaph regEX) query:

https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html

However, this operates on text using the CoreNLP tools, which means
the dependency graphs may not align with stanza's depparse module, and
this also limits the languages for which it can be used.  This module
allows for running semgrex commands on the graphs produced by
depparse.

To use, first process text into a doc using stanza.Pipeline

Next, pass the processed doc and a list of semgrex patterns to
process_doc in this module.  It will run the java semgrex module as a
subprocess and return the result in the form of a SemgrexResponse,
whose description is in the proto file included with stanza.

A minimal example is the main method of this module.

Note that launching the subprocess is potentially quite expensive
relative to the search if used many times on small documents.  Ideally
larger texts would be processed, and all of the desired semgrex
patterns would be run at once.  The worst thing to do would be to call
this multiple times on a large document, one invocation per semgrex
pattern, as that would serialize the document each time.
Included here is a context manager which allows for keeping the same
java process open for multiple requests.  This saves on the subprocess
launching time.  It is still important not to wastefully serialize the
same document over and over, though.
    N)SemgrexRequestSemgrexResponse)send_request	add_tokenadd_word_to_graphJavaProtobufContextconvert_networkx_graph)CoNLLz7edu.stanford.nlp.semgraph.semgrex.ProcessSemgrexRequestc                 C   s   t | ttS N)r   r   SEMGREX_JAVA)request r   P/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/server/semgrex.pysend_semgrex_request-   s   r   Fc                 C   s   t  }t|tr|g}|D ]}|j| qt| jD ]4\}}|j }|r.t	|j
|| qd}|jD ]}	|	jD ]}
t|j|
|	 t|j
|
|| |d }q8q3q|S )Nr      )r   
isinstancestrsemgrexappend	enumerate	sentencesqueryaddr   graphtokenswordsr   tokenr   )docsemgrex_patternsenhancedr   r   sent_idxsentencer   word_idxr   wordr   r   r   build_request0   s$   




r$   r   c                G   s   t | ||d}t|S )z
    Returns the result of processing the given semgrex expression on the stanza doc.

    Currently the return is a SemgrexResponse from CoreNLP.proto
    r%   )r$   r   )r   r   r   r   r   r   r   process_docG   s   r&   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )Semgrexz
    Semgrex context window

    This is a context window which keeps a process open.  Should allow
    for multiple requests without launching new java processes each time.
    Nc                    s   t t| |tt d S r
   )superr'   __init__r   r   )self	classpath	__class__r   r   r)   X   s   zSemgrex.__init__c                 G   s   t ||}| |S )z[
        Apply each of the semgrex patterns to each of the dependency trees in doc
        )r$   process_request)r*   r   r   r   r   r   r   process[   s   

zSemgrex.processr
   )__name__
__module____qualname____doc__r)   r/   __classcell__r   r   r,   r   r'   Q   s    r'   c              	      s  t | } t|tr|g}g }t| j|jD ]h\ }d}t||jD ]T\}}|dd}t|j	dkr; 
d|  q"d}|j	D ]5}	d|	j j|	jd  jf }
t|	jdkrZd	}n fd
d|	jD }dd| } 
d||
|f  q@q"|r~|  q|r|| _| S )zT
    Put comments on the sentences which describe the matching semgrex patterns
    F
 r   z%# semgrex pattern |%s| did not match!Tz%d:%sr    c                    s,   g | ]}d |j |j j|jd  jf qS )z%s=%d:%sr   )name
matchIndexr   text).0noder!   r   r   
<listcomp>w   s    $z annotate_doc.<locals>.<listcomp>z  z&# semgrex pattern |%s| matched at %s%s)copydeepcopyr   r   zipr   resultreplacelenmatchadd_commentr9   r   r:   r<   joinr   )r   semgrex_resultr   matches_onlymatching_sentencesgraph_resultsentence_matchedsemgrex_patternpattern_resultrE   
match_wordnode_matchesr   r=   r   annotate_docb   s6   



	
rQ   c                  C   sl  t  } | jdtddd | jdtddgdd	 | jd
tddd | jdddddd | jddddd | jddddd | jddddd |  }|jrlt|j}dd | D |_W d   n1 sgw   Y  |j	rxt
j|j	dd}ntjddd }|d!}|jrtd"| t  td# t  t|g|jR d$|ji}t|||j|j}td"| dS )%a2  
    Runs a toy example, or can run a given semgrex expression on the given input file.

    For example:
    python3 -m stanza.server.semgrex --input_file demo/semgrex_sample.conllu

    --matches_only to only print sentences that match the semgrex pattern
    --no_print_input to not print the input
    z--input_fileNz<Input file to process (otherwise will process a sample text))typedefaulthelpr   *z{}=source >obj=zzz {}=targetzKSemgrex to apply to the text.  The default looks for sentences with objects)rR   nargsrS   rT   z--semgrex_filezFile to read semgrex patterns from - relevant in case the pattern you want to use doesn't work well on the command line, for examplez--print_inputprint_input
store_trueFz9Print the input alongside the output - gets kind of noisy)destactionrS   rT   z--no_print_inputstore_falsez?Don't print the input alongside the output - gets kind of noisy)rY   rZ   rT   z--matches_onlyz!Only print the matching sentences)rZ   rS   rT   z
--enhancedz2Use the enhanced dependencies instead of the basicc                 S   s   g | ]
}|  r|  qS r   )strip)r;   xr   r   r   r>      s    zmain.<locals>.<listcomp>)
input_fileignore_gappingenztokenize,pos,lemma,depparse)
processorsz1Uro ruined modern.  Fortunately, Wotc banned him.z{:C}zK---------------------------------------------------------------------------r   )argparseArgumentParseradd_argumentr   
parse_argssemgrex_fileopen	readlinesr   r^   r	   	conll2docstanzaPipelinerW   printformatr&   r   rQ   rI   )parserargsfinr   nlprH   r   r   r   main   s2   
rr   __main__)F)r3   rb   r?   rj   stanza.protobufr   r   $stanza.server.java_protobuf_requestsr   r   r   r   r   stanza.utils.conllr	   r   r   r$   r&   r'   rQ   rr   r0   r   r   r   r   <module>   s"    "

 '
