-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTranslator.py
94 lines (81 loc) · 3.43 KB
/
Translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""1424mt.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15onU8NVEipMCXHbm8nNhzLJg5KvInWwq
"""
# !pip3 install demoji
# !pip3 install translators
# !pip3 install demoji
# !pip3 install urllib3
import json
import re
# from urllib3 import HTTPError
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import demoji
import translators as ts
#removing stop words and preprocessing
def remove_stopwords(text, lang):
    """Remove NLTK stopwords from already-preprocessed text.

    Args:
        text: Whitespace-separated text (as produced by ``preprocess``).
        lang: Language code; ``'en'`` selects the English stopword list,
            any other value falls back to the Spanish list.

    Returns:
        The text with stopword tokens removed, joined by single spaces.
    """
    # Build the stopword set once, before iterating tokens.
    corpus_name = 'english' if lang == 'en' else 'spanish'
    stop_words = set(stopwords.words(corpus_name))
    # Raw string avoids the invalid-escape-sequence warning for '\s'.
    # re.split(r'\s', ...) splits on each single whitespace character,
    # matching the original behavior (runs of whitespace yield empty
    # tokens, which pass the filter and vanish in the join).
    tokens = re.split(r'\s', text)
    return ' '.join(t for t in tokens if t not in stop_words)
# Function to preprocess tweets
def preprocess(raw_tweet, lang):
    """Clean a raw tweet for translation.

    Lowercases, strips URLs, mentions, the leading 'rt' marker, punctuation,
    digits, and emojis, collapses whitespace, and removes stopwords for
    English and Spanish (Hindi has no stopword list here).

    Args:
        raw_tweet: The original tweet text.
        lang: Tweet language code ('en', 'es', or 'hi').

    Returns:
        The cleaned tweet text.
    """
    text = raw_tweet.lower()                     # convert to lowercase
    text = re.sub(r'\n', ' ', text)              # replace newlines with spaces
    text = re.sub(r"http\S+", "", text)          # remove urls
    text = re.sub('#', ' ', text)                # remove '#' but leave text from hashtag
    text = re.sub('@[a-zA-Z]+', ' ', text)       # remove mentions
    text = re.sub('^rt ', ' ', text)             # remove leading 'rt'
    # NOTE: the unescaped '–-\’' inside the class is a character RANGE
    # (U+2013..U+2019), covering several dash/quote variants.
    text = re.sub(r'[,\.\:\!¡\?\¿\_–-\’\$%|]', ' ', text)  # remove punctuation
    text = re.sub('[0-9]+', ' ', text)           # remove numbers
    text = re.sub(r'\s+', ' ', text)             # collapse whitespace runs
    text = re.sub(r'^\s+', '', text)             # remove space(s) at start
    text = re.sub(r'\s+$', '', text)             # remove space(s) at end
    # BUG FIX: original test was `lang == 'en' or 'es'`, which is always
    # truthy, so Hindi tweets were wrongly run through the Spanish
    # stopword list. No stopword removal for Hindi.
    if lang in ('en', 'es'):
        text = remove_stopwords(text, lang)
    # emojis = list(demoji.findall(text).keys()) # in case we want to store emojis
    text = demoji.replace(text, '')
    return text
# Each source language is translated into the other two languages;
# the order of targets matches the original key-insertion order.
TRANSLATION_TARGETS = {
    'en': ('es', 'hi'),
    'hi': ('es', 'en'),
    'es': ('en', 'hi'),
}

# Process tweet file chunks 1..59: preprocess each tweet and attach
# machine translations into the other two languages.
for i in range(1, 60):
    # Context manager closes the input file (original leaked the handle).
    with open('/content/twitter_data_split' + str(i) + '.json') as tweetfile:
        tweets = json.load(tweetfile)

    tweets_translated = []
    for tweet in tweets:
        lang = tweet['tweet_lang']
        # Only translate non-empty tweets in a known language; all tweets
        # (translated or not) are kept in the output, as before.
        if lang in TRANSLATION_TARGETS and tweet['tweet_text']:
            tweet['tweet_text'] = preprocess(tweet['tweet_text'], lang)
            for target in TRANSLATION_TARGETS[lang]:
                tweet['text_' + target] = ts.google(
                    tweet['tweet_text'], from_language=lang, to_language=target)
        tweets_translated.append(tweet)

    print(len(tweets_translated))
    # Serialize straight to the output file instead of building an
    # intermediate string.
    with open('/content/TR_split_' + str(i) + '.json', 'w') as f:
        json.dump(tweets_translated, f)