o
    hG
                     @   s   d Z ddlZddlZddlZddlmZ ddlm  m  m	  m
Z
 dd Zdd Zdd	 Zed
krJejd Zejd Zejd Zeeee dS dS )ax  
Airline tweets from Kaggle
from https://www.kaggle.com/crowdflower/twitter-airline-sentiment/data#
Some ratings seem questionable, but it doesn't hurt performance much, if at all

Files in the airline repo are csv, with quotes in "..." if they contained commas themselves.

Accordingly, we use the csv module to read the files and output them in the format
<class> <sentence>

Run using 

python3 convert_airline.py Tweets.csv train.json

If the first word is an @, it is removed, and after that, leading @ or # are removed.
For example:

@AngledLuffa you must hate having Mox Opal #banned
-> 
you must hate having Mox Opal banned
    N)SentimentDatumc           	      C   s   t j| d}t|dd}tj|ddd}t|}W d    n1 s$w   Y  g }|dd  D ]2}|d }|dkr>d	}n|d
krEd}n|dkrLd}ntd||d 	dd}|
t|| q1|S )Nz
Tweets.csv )newline,")	delimiter	quotechar   negative0neutral1positive2zUnknown sentiment: {}
   
 )ospathjoinopencsvreaderlist
ValueErrorformatreplaceappendr   )	in_directoryin_filenamefincinlinesphrasesline	sentiment	utterance r'   j/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/sentiment/process_airline.pyget_phrases   s$   
r)   c                 C   s6   t | }t|}dd |D }tdt| |S )Nc                 S   s    g | ]}t |jt|jqS r'   )r   r%   process_utilsclean_tokenized_tweettext).0xr'   r'   r(   
<listcomp>8   s     z)get_tokenized_phrases.<locals>.<listcomp>z&Found {} phrases in the airline corpus)r)   r*   get_ptb_tokenized_phrasesprintr   len)r   r#   r'   r'   r(   get_tokenized_phrases5   s
   
r3   c                 C   s8   t | }tj|dd tj|d| }t|| d S )NT)exist_okz%s.train.json)r3   r   makedirsr   r   r*   
write_list)r   out_directory
short_namer#   out_filenamer'   r'   r(   main<   s   r:   __main__r	         )__doc__r   r   sysstanza.models.classifiers.datar   -stanza.utils.datasets.sentiment.process_utilsutilsdatasetsr%   r*   r)   r3   r:   __name__argvr   r7   r8   r'   r'   r'   r(   <module>   s    


