o
    h                     @   s   d Z ddlZddlZddlmZ ddlm  m  m  m	Z	 dd Z
dd Zdd	 Zed
krFejd Zejd Zejd Zeeee dS dS )ah  
A small dataset of 1500 positive and 1500 negative sentences.
Supposedly has no neutral sentences by design

https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

https://archive.ics.uci.edu/ml/machine-learning-databases/00331/

See the existing readme for citation requirements etc

Files in the slsd repo were one line per annotation, with labels 0
for negative and 1 for positive.  No neutral labels existed.

Accordingly, we rearrange the text and adjust the label to fit the
0/1/2 paradigm.  Text is retokenized using PTBTokenizer.

<class> <sentence>

process_slsd.py <directory> <outputfile>
    N)SentimentDatumc                 C   s   t j| dt j| dt j| dg}g }|D ]}|t|dd qg }|D ]9}| }|d }|d d }|dd}|d	d
}|dkrKd}n|dkrRd}ntd||	t
|| q(|S )Nzamazon_cells_labelled.txtzimdb_labelled.txtzyelp_labelled.txt )newlinez!.!z?.?012zUnknown sentiment: {})ospathjoinextendopenstripreplace
ValueErrorformatappendr   )in_directoryin_filenameslinesfilenamephrasesline	sentiment	utterance r   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/sentiment/process_slsd.pyget_phrases   s*   r   c                 C   s&   t | }t|}tdt|  |S )NzFound %d phrases in slsd)r   process_utilsget_ptb_tokenized_phrasesprintlen)r   r   r   r   r   get_tokenized_phrases6   s   
r$   c                 C   s8   t | }tj|d| }tj|dd t|| d S )Nz%s.train.jsonT)exist_ok)r$   r   r   r   makedirsr    
write_list)r   out_directory
short_namer   out_filenamer   r   r   main<   s   r+   __main__         )__doc__r   sysstanza.models.classifiers.datar   -stanza.utils.datasets.sentiment.process_utilsutilsdatasetsr   r    r   r$   r+   __name__argvr   r(   r)   r   r   r   r   <module>   s    


