"""
USAGE is produced by the same people as SCARE.  

USAGE has a German and English part.  This script parses the German part.
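Run the script as
  process_usage_german.py in_directory out_directory short_name

Here, in_directory should be where USAGE was unpacked.  It will have the
documents, files, etc subdirectories.  The processed snippets are
written to out_directory/short_name.train.json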

https://www.romanklinger.de/usagecorpus/
"""

import csv
import glob
import os
import sys

import stanza

from stanza.models.classifiers.data import SentimentDatum
import stanza.utils.datasets.sentiment.process_utils as process_utils

def main(in_directory, out_directory, short_name):
    """
    Convert the German part of USAGE into a sentiment training file.

    Reads every file matching files/de*csv under in_directory, tokenizes
    each annotated snippet with the stanza German tokenizer, and writes
    the resulting items to out_directory/short_name.train.json

    Sentiment labels are mapped positive -> 2, neutral -> 1, negative -> 0.
    Rows labeled unknown are skipped.

    Raises ValueError if a snippet's length disagrees with its begin/end
    offsets, or if a row carries an unrecognized sentiment label.
    """
    sentiment_ids = {'positive': 2, 'neutral': 1, 'negative': 0}

    os.makedirs(out_directory, exist_ok=True)
    nlp = stanza.Pipeline('de', processors='tokenize')

    num_short_items = 0
    snippets = []
    # glob returns matches in arbitrary order; sort so that the order of
    # the output does not depend on the filesystem
    csv_files = sorted(glob.glob(os.path.join(in_directory, "files/de*csv")))
    for csv_filename in csv_files:
        # Decode explicitly instead of relying on the locale default, since
        # the offset check below counts decoded characters.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm
        with open(csv_filename, newline='', encoding='utf-8') as fin:
            cin = csv.reader(fin, delimiter='\t', quotechar=None)

            for index, line in enumerate(cin):
                if not line:
                    # csv.reader yields an empty list for a blank line
                    continue

                # columns used: begin offset, end offset, snippet text, sentiment label
                begin, end, snippet, sentiment = [line[i] for i in [2, 3, 4, 6]]
                begin = int(begin)
                end = int(end)
                # sanity check: the snippet should be exactly as long as its span
                if len(snippet) != end - begin:
                    raise ValueError("Error found in {} line {}.  Expected {} got {}".format(csv_filename, index, (end-begin), len(snippet)))

                label = sentiment.lower()
                if label == 'unknown':
                    continue
                if label not in sentiment_ids:
                    raise ValueError("Tell John he screwed up and this is why he can't have Mox Opal: {}".format(sentiment))

                doc = nlp(snippet)
                text = [token.text for sentence in doc.sentences for token in sentence.tokens]
                # short snippets are kept in the output, only counted for the report
                if len(text) < 4:
                    num_short_items += 1
                snippets.append(SentimentDatum(sentiment_ids[label], text))

    print("Total snippets found for USAGE: %d" % len(snippets))
    print("Snippets with fewer than 4 tokens: %d" % num_short_items)

    process_utils.write_list(os.path.join(out_directory, "%s.train.json" % short_name), snippets)

if __name__ == '__main__':
    # Three positional arguments are required; anything after them is ignored.
    # Exit with a usage message rather than an IndexError traceback.
    if len(sys.argv) < 4:
        sys.exit("Usage: {} in_directory out_directory short_name".format(sys.argv[0]))

    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
    short_name = sys.argv[3]

    main(in_directory, out_directory, short_name)
