forked from neilkod/tweetParser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
executable file
·99 lines (75 loc) · 2.76 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
""" Parses Twitter JSON data from stdin. A huge to-do
is to accept input/output filenames; right now, the filenames are hard-coded for testing
purposes. """
import codecs
import os
import re
import sys

import simplejson as json
# When true, parseFile renames each input file into archiveDirectory once it
# has finished reading it.
archivedProcessedFile = True
# Directory (relative path) that parseFile moves processed input files into.
archiveDirectory = 'processed'
# Not referenced anywhere else in this file as shown; presumably a placeholder
# for compressing archived files -- TODO confirm before relying on it.
compressProcessedFile=False
def writeToLog(logFile, text):
    """Write ``text`` to ``logFile`` as one newline-terminated line.

    ``logFile`` is any object with a ``write`` method; ``text`` must be a
    string that can be concatenated with ``'\\n'``.
    """
    line = text + '\n'
    logFile.write(line)
# Matches "<...a...>LABEL<.../a...>" anchored at the start of the string and
# captures LABEL. Compiled once at import time instead of on every call.
# h/t @mattrepl for the regexp.
_CLIENT_LINK_RE = re.compile(r"<.*[aA].*>(.*)<.*/[aA].*>")


def getClient(clientText):
    """Return the link label from an HTML anchor, or the input unchanged.

    Intended for a tweet's ``source`` value (see the disabled call in
    parseTweet), which is either plain text or an anchor element.

    Args:
        clientText: the raw string to inspect.

    Returns:
        The text between the opening tag and the closing ``</a>`` tag when
        ``clientText`` starts with such an element; otherwise ``clientText``
        itself.
    """
    linkMatch = _CLIENT_LINK_RE.match(clientText)
    if linkMatch is None:
        # Not an anchor element: the text already is the client name.
        return clientText
    return linkMatch.group(1)
def parseTweet(jsondata, logFileHandle):
    """Parse one line of Twitter JSON and print it as a tab-separated record.

    The printed record is UTF-8 encoded and has the form::

        id <TAB> created_at <TAB> user.screen_name <TAB> text

    Records without a ``text`` key (e.g. deleted and scrub_geo notices) are
    skipped silently. Any ordinary error while decoding or formatting prints
    ``fail`` instead of raising, so one bad line does not stop a long run.
    KeyboardInterrupt and SystemExit are deliberately NOT swallowed.

    Args:
        jsondata: one JSON document as a string.
        logFileHandle: currently unused (the log write is disabled); kept so
            existing callers keep working.
    """
    try:
        tweet = json.loads(jsondata)
        # Look for a text element; this helps avoid deleted and scrub_geo tweets.
        if 'text' not in tweet:
            return

        # Client ("source") extraction is temporarily disabled. Some tweets
        # were seen without a source, possibly scrub_geo ones. To re-enable:
        #     if 'source' in tweet:
        #         client = getClient(tweet['source'])
        #     else:
        #         client = 'Undefined'

        # Flatten linefeeds and tabs so every tweet stays on a single
        # tab-separated output line.
        tweetText = tweet['text'].replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')

        tweetId = tweet['id']
        createdAt = tweet['created_at']
        screenName = tweet['user']['screen_name']
        output = '%d\t%s\t%s\t%s' % (tweetId, createdAt, screenName, tweetText)
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement this script was written for.
        print(output.encode('utf8'))
    except Exception:
        # Best-effort: report the bad line and carry on with the next one.
        print("fail")
def parseFile(inputFile):
    """Run parseTweet over every line of ``inputFile`` and then archive it.

    Side effects:
      * creates (truncates) ``output/<inputFile>.out``; note that parseTweet
        currently prints to stdout, so nothing is written to that file here;
      * prints a summary line with the number of lines read;
      * when ``archivedProcessedFile`` is true, renames ``inputFile`` to
        ``<archiveDirectory>/<inputFile>``.

    Args:
        inputFile: path of a file holding one JSON document per line. It is
            used verbatim to build the output and archive paths.
    """
    cnt = 0
    # For the time being, we just assume the output file is inputFile.out;
    # we'll make this more dynamic later.
    # badFileHandle = codecs.open('bad/' + inputFile + '.bad','w','utf-8')
    with codecs.open('output/' + inputFile + '.out', 'w', 'utf-8') as logFileHandle:
        with open(inputFile) as inputHandle:
            for line in inputHandle:
                parseTweet(line, logFileHandle)
                cnt += 1
    print("%s: parsed %d tweets" % (inputFile, cnt))
    # Both handles are closed by now, so the rename is safe on every platform.
    if archivedProcessedFile:
        os.rename(inputFile, archiveDirectory + '/' + inputFile)
if __name__ == '__main__':
    # Script entry point: feed every line from stdin to parseTweet.
    # tweets.txt is opened in append mode and handed to parseTweet; the
    # `with` block closes it even if the loop is interrupted.
    logFile = 'tweets.txt'
    with codecs.open(logFile, 'a', 'utf-8') as logFileHandle:
        for line in sys.stdin:
            parseTweet(line, logFileHandle)