-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTranslator.py
94 lines (81 loc) · 3.43 KB
/
Translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""1424mt.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15onU8NVEipMCXHbm8nNhzLJg5KvInWwq
"""
# !pip3 install demoji
# !pip3 install translators
# !pip3 install demoji
# !pip3 install urllib3
import json
import re
# from urllib3 import HTTPError
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import demoji
import translators as ts
#removing stop words and preprocessing
def remove_stopwords(text, lang):
    """Remove NLTK stopwords from already-preprocessed text.

    Args:
        text: Whitespace-separated text (as produced by ``preprocess``).
        lang: Language code; ``'en'`` selects the English stopword list,
            any other value falls back to the Spanish list.

    Returns:
        The text with stopword tokens removed, joined by single spaces.
    """
    # Build the stopword set once, before iterating tokens.
    corpus_name = 'english' if lang == 'en' else 'spanish'
    stop_words = set(stopwords.words(corpus_name))
    # Raw string avoids the invalid-escape-sequence warning for '\s'.
    # re.split(r'\s', ...) splits on each single whitespace character,
    # matching the original behavior (runs of whitespace yield empty
    # tokens, which pass the filter and vanish in the join).
    tokens = re.split(r'\s', text)
    return ' '.join(t for t in tokens if t not in stop_words)
# Function to preprocess tweets
def preprocess(raw_tweet, lang):
    """Clean a raw tweet for translation.

    Lowercases, strips URLs, mentions, the leading 'rt' marker, punctuation,
    digits, and emojis, collapses whitespace, and removes stopwords for
    English and Spanish (Hindi has no stopword list here).

    Args:
        raw_tweet: The original tweet text.
        lang: Tweet language code ('en', 'es', or 'hi').

    Returns:
        The cleaned tweet text.
    """
    text = raw_tweet.lower()                     # convert to lowercase
    text = re.sub(r'\n', ' ', text)              # replace newlines with spaces
    text = re.sub(r"http\S+", "", text)          # remove urls
    text = re.sub('#', ' ', text)                # remove '#' but leave text from hashtag
    text = re.sub('@[a-zA-Z]+', ' ', text)       # remove mentions
    text = re.sub('^rt ', ' ', text)             # remove leading 'rt'
    # NOTE: the unescaped '–-\’' inside the class is a character RANGE
    # (U+2013..U+2019), covering several dash/quote variants.
    text = re.sub(r'[,\.\:\!¡\?\¿\_–-\’\$%|]', ' ', text)  # remove punctuation
    text = re.sub('[0-9]+', ' ', text)           # remove numbers
    text = re.sub(r'\s+', ' ', text)             # collapse whitespace runs
    text = re.sub(r'^\s+', '', text)             # remove space(s) at start
    text = re.sub(r'\s+$', '', text)             # remove space(s) at end
    # BUG FIX: original test was `lang == 'en' or 'es'`, which is always
    # truthy, so Hindi tweets were wrongly run through the Spanish
    # stopword list. No stopword removal for Hindi.
    if lang in ('en', 'es'):
        text = remove_stopwords(text, lang)
    # emojis = list(demoji.findall(text).keys()) # in case we want to store emojis
    text = demoji.replace(text, '')
    return text
# Each source language is translated into the other two languages;
# the order of targets matches the original key-insertion order.
TRANSLATION_TARGETS = {
    'en': ('es', 'hi'),
    'hi': ('es', 'en'),
    'es': ('en', 'hi'),
}

# Process tweet file chunks 1..59: preprocess each tweet and attach
# machine translations into the other two languages.
for i in range(1, 60):
    # Context manager closes the input file (original leaked the handle).
    with open('/content/twitter_data_split' + str(i) + '.json') as tweetfile:
        tweets = json.load(tweetfile)

    tweets_translated = []
    for tweet in tweets:
        lang = tweet['tweet_lang']
        # Only translate non-empty tweets in a known language; all tweets
        # (translated or not) are kept in the output, as before.
        if lang in TRANSLATION_TARGETS and tweet['tweet_text']:
            tweet['tweet_text'] = preprocess(tweet['tweet_text'], lang)
            for target in TRANSLATION_TARGETS[lang]:
                tweet['text_' + target] = ts.google(
                    tweet['tweet_text'], from_language=lang, to_language=target)
        tweets_translated.append(tweet)

    print(len(tweets_translated))
    # Serialize straight to the output file instead of building an
    # intermediate string.
    with open('/content/TR_split_' + str(i) + '.json', 'w') as f:
        json.dump(tweets_translated, f)