o
    hW                     @   s   d dl Z d dlZd dlZd dlmZ d dlm  m  m  m	Z	 d dl
m  mZ g dZg dgg dg dddgg d	d
Zdd Zdd Zdd Zdd Zdd Zedkr]e  dS dS )    N)SentimentDatum)z	train.txtzdev.txtztest.txtzextra-train.txtzchecked-extra-train.txt
-root_only)-ignore_labels2-remap_labels1=0,2=-1,3=1,4=1)r   r   r   r   r   r   0=0,1=0,2=1,3=2,4=2)r   r   r   )	fiveclassrootbinary
binaryroot
threeclassthreeclassrootc                 G   s   ddd| g}t |dkr|t| }td| tj|dtjtjdd}|jd	}d
d |D }dd |D }dd |D }dd |D }|S )z
    Use the CoreNLP OutputSubtrees tool to convert the input file to a bunch of phrases

    Returns a list of the SentimentDatum namedtuple
    javaz%edu.stanford.nlp.trees.OutputSubtreesz-inputr    Tzutf-8)checkstdoutstderrencoding
c                 S   s   g | ]}|  qS  )strip.0xr   r   f/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/sentiment/process_sst.py
<listcomp>#       z get_subtrees.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r   r   r   r   r   r   r   $   r   c                 S   s   g | ]}|j d dqS )   )maxsplit)splitr   r   r   r   r   %   s    c                 S   s"   g | ]}t |d  |d  qS )r   r   )r   r    r   r   r   r   r   &   s   " )	lenlistprintjoin
subprocessrunPIPEr   r    )
input_fileargscmdresultslinesphrasesr   r   r   get_subtrees   s   r.   c                 C   sV   t |  }tj|d|}tj|st|t|g|R  }tdt	|||  |S )Nr	   zFound {} phrases in SST {} {})
	ARGUMENTSospathr$   existsFileNotFoundErrorr.   r#   formatr!   )datasettreebank_file	input_dir
extra_argsr(   r-   r   r   r   get_phrases)   s   r9   c                 C   s<   t | ||}tj|d| |dd f }t|| dS )zq
    Convert the fiveclass files to a specific format

    Uses the ARGUMENTS specific for the format wanted
    zen_sst.%s.%s.json.r   N)r9   r0   r1   r$   r    process_utils
write_list)r5   r6   r7   
output_dirr-   output_filer   r   r   convert_version3   s    r?   c               
   C   sJ   t  } | jdtdddt d |  }|j	s#t
t |_	|S )zN
    Actually, the only argument used right now is the formats to convert
    sections*z Which transformations to use: {}r   )typenargshelp)argparseArgumentParseradd_argumentstrr4   r$   r/   keys
parse_argsr@   r"   )parserr)   r   r   r   rJ   =   s   "rJ   c                  C   s^   t  } t }tj|d d}|d }tj|dd | jD ]}tD ]	}t	|||| q"qd S )NSENTIMENT_BASEzsentiment-treebankSENTIMENT_DATA_DIRT)exist_ok)
rJ   default_pathsget_default_pathsr0   r1   r$   makedirsr@   TREEBANK_FILESr?   )r)   pathsr7   r=   sectionr6   r   r   r   mainH   s   
rU   __main__)rE   r0   r%   stanza.models.classifiers.datar   -stanza.utils.datasets.sentiment.process_utilsutilsdatasets	sentimentr;   stanza.utils.default_pathsrO   rR   r/   r.   r9   r?   rJ   rU   __name__r   r   r   r   <module>   s,    



