o
    hh	                     @   s   d Z ddlZddlZddlZddlmZ ddlm  m  m	  m
Z
 dd Zdd Zdd	 Zed
krJejd Zejd Zejd Zeeee dS dS )a  
MELD is a dataset of Friends (the TV show) utterances.  

The ratings include judgment based on the visuals, so it might be
harder than expected to directly extract from the text.  However, it
should broaden the scope of the model and doesn't seem to hurt
performance.

https://github.com/SenticNet/MELD/tree/master/data/MELD

https://github.com/SenticNet/MELD

https://arxiv.org/pdf/1810.02508.pdf

Files in the MELD repo are csv, with quotes in "..." if they contained commas themselves.

Accordingly, we use the csv module to read the files and output them in the format
<class> <sentence>

Run using 

python3 convert_MELD.py MELD/train_sent_emo.csv train.txt
etc

    N)SentimentDatumc                 C   s   t | ddd}tj|ddd}t|}W d   n1 sw   Y  g }|dd D ]2}|d	 }|d
kr8d}n|dkr?d}n|dkrFd}ntd||d dd}|t|| q+|S )z4
    Get the phrases from a single CSV filename
     zwindows-1252)newlineencoding,")	delimiter	quotecharN      negative0neutral1positive2zUnknown sentiment: {}   Â)	opencsvreaderlist
ValueErrorformatreplaceappendr   )in_filenamefincinlinesphrasesline	sentiment	utterance r#   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/sentiment/process_MELD.pyget_phrases"   s"   
r%   c                 C   s<   t j|d|  }t|}t|}tdt||  |S )z!
    split in train,dev,test
    z%s_sent_emo.csvzFound {} phrases in MELD {})	ospathjoinr%   process_utilsget_ptb_tokenized_phrasesprintr   len)splitin_directoryr   r   r#   r#   r$   get_tokenized_phrases9   s
   
r/   c              	   C   sD   t j|dd dD ]}t|| }tt j|d||f | q	d S )NT)exist_ok)traindevtestz
%s.%s.json)r&   makedirsr/   r)   
write_listr'   r(   )r.   out_directory
short_namer-   r   r#   r#   r$   mainD   s
   
 r8   __main__r
         )__doc__r   r&   sysstanza.models.classifiers.datar   -stanza.utils.datasets.sentiment.process_utilsutilsdatasetsr!   r)   r%   r/   r8   __name__argvr.   r6   r7   r#   r#   r#   r$   <module>   s    


