getdata.py
import re
import requests as r
from fake_useragent import UserAgent
import json
import datetime
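
# getdata.py: collect Habr news through the mobile "kek" v2 API and save them
# to a JSON file (views, rating, votes, favorites, comment count, publication
# date, teaser, header, and full article text for every item).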


def news():
    # Generate a time-dependent filename so the data is not overwritten on repeated runs.
    now = datetime.datetime.now()
    filename = "data" + str(now.minute) + str(now.second) + ".json"

    # Habr's "kek api v2" returns article text as HTML, so strip the tags.
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext.replace("\r\n", "")
    # get_page fetches one page of the news listing as JSON.
    def get_page(num):
        API_URL = 'https://m.habr.com/kek/v2/articles?fl=ru&hl=ru&news=true&page=%d' % num
        # A random User-Agent makes repeated requests look less like a bot.
        response = r.get(API_URL, headers={'User-Agent': UserAgent().random})
        if response.status_code == 200:
            return response.json()
        return None

    # full_text fetches the full text of an article (get_page only returns a teaser).
    def full_text(id):
        API_URL = 'https://m.habr.com/kek/v2/articles/%d' % id
        response = r.get(API_URL)
        if response.status_code == 200:
            return response.json()['textHtml']
        return ''
    # Get the total page count so the loop knows where to stop.
    pages = get_page(1)['pagesCount']

    # Fetch all news and write them out as one valid JSON array: the separator is
    # written before every item except the first, so no trailing comma is left behind.
    count = 0
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("[")
        for i in range(1, pages + 1):
            page = get_page(i)
            if page is None:
                continue
            for ref in page['articleRefs'].values():
                piece_of_news = {}
                # Collect the fields we need for this item.
                piece_of_news['views'] = ref['statistics']['readingCount']
                piece_of_news['rating'] = ref['statistics']['score']
                piece_of_news['votes'] = ref['statistics']['votesCount']
                piece_of_news['favorites'] = ref['statistics']['favoritesCount']
                piece_of_news['comments'] = ref['statistics']['commentsCount']
                piece_of_news['pubdate'] = ref['timePublished']
                piece_of_news['teaser'] = cleanhtml(ref['leadData']['textHtml'])
                piece_of_news['header'] = cleanhtml(ref['titleHtml'])
                piece_of_news['text'] = cleanhtml(full_text(int(ref['id'])))
                # Write piece_of_news into the file.
                if count > 0:
                    f.write(",")
                json.dump(piece_of_news, f, ensure_ascii=False, indent=4)
                # Print progress.
                count += 1
                print(count)
        f.write("]")

    print('Data received and written.')

news()