o
    h                     @   s   d Z ddlZddlZddlZddlmZ ddlm  m  m	  m
Z
 G dd deZdddd	Zdd
dZedkrDed e  dS dS )a  
Processes the SB10k dataset

The original description of the dataset and corpus_v1.0.tsv is here:

https://www.spinningbytes.com/resources/germansentiment/

Download script is here:

https://github.com/aritter/twitter_download

The problem with this file is that many of the tweets with labels no
longer exist.  Roughly 1/3 as of June 2020.

You can contact the authors for the complete dataset.

There is a paper describing some experiments run on the dataset here:
https://dl.acm.org/doi/pdf/10.1145/3038912.3052611
    N)Enumc                   @   s   e Zd ZdZdZdZdS )Split         N)__name__
__module____qualname__TRAIN_DEV_TEST	TRAIN_DEVTEST r   r   h/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/sentiment/process_sb10k.pyr      s    r   210)positiveneutralnegativec              
   C   s  t  }|jdtd dd |jdtd dd |jdtddd |jd	td
dd |jdtddd |jddd tjdd |j| d} t	| j
| j| jdt}tt| t| tj| jdd | jtju rt| j|td| j dtd| j dtd| j df d S | jtju rt| j|td| j dtd| j df d S | jtju rttj| jd| j | d S td| j)Nz--csv_filenamezCSV file to read in)typedefaulthelpz	--out_dirzWhere to write the output filesz--sentiment_columnr   zColumn with the sentimentz--text_columnr   zColumn with the textz--short_namesb10kz$short name to use when writing filesz--splitc                 S   s   t |   S N)r   upper)xr   r   r   <lambda>.   s    zmain.<locals>.<lambda>zHow to split the resulting data)argsdeT)exist_okz%s.train.jsong?z%s.dev.jsong?z%s.test.jsong?zUnknown split method {})argparseArgumentParseradd_argumentstrintr   r
   
parse_argsprocess_utilsread_snippetscsv_filenamesentiment_columntext_columnMAPPINGprintlenrandomshuffleosmakedirsout_dirsplitwrite_splits
short_namer   r   
write_listpathjoin
ValueErrorformat)r   parsersnippetsr   r   r   main&   s@   
"r=   __main__i  r   )__doc__r    r0   r.   enumr   -stanza.utils.datasets.sentiment.process_utilsutilsdatasets	sentimentr&   r   r+   r=   r   seedr   r   r   r   <module>   s     
#

