-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathcrawl_reviews.py
40 lines (35 loc) · 1.37 KB
/
crawl_reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import time
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Crawl reviewer ratings and the final decision for every paper listed in
# 'paperlist.tsv' and write them to 'ratings.tsv'.
#
# Input : 'paperlist.tsv' - tab-separated, first column is the index
#         (used as paper_id); must contain a 'link' column with the page URL.
# Output: 'ratings.tsv'   - tab-separated, one row per paper_id, one column
#         per collected rating plus a 'decision' column.
#
# Papers whose page fails to load or parse are reported on stdout and stored
# with no ratings and the decision 'Unknown'. Ctrl-C stops the crawl early
# and still writes whatever was collected so far.

# NOTE(review): passing the driver path positionally matches older Selenium
# releases; newer Selenium 4 releases appear to expect a Service object
# instead - confirm against the installed version before changing this.
driver = webdriver.Edge('msedgedriver.exe')
ratings = dict()
decisions = dict()

# Parent elements of every "note_content_value" span inside the replies
# container; their text is matched below against the 'Rating:' and
# 'Decision:' prefixes.
xpath = '//div[@id="note_children"]//span[@class="note_content_value"]/..'
cond = EC.presence_of_element_located((By.XPATH, xpath))

try:
    df = pd.read_csv('paperlist.tsv', sep='\t', index_col=0)
    # Materialise the items so tqdm knows the total and can show progress.
    for paper_id, link in tqdm(list(df.link.items())):
        try:
            driver.get(link)
            # The notes are rendered asynchronously; wait up to 60 seconds
            # for at least one matching element to appear.
            WebDriverWait(driver, 60).until(cond)
            # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
            # which no longer exists in current Selenium releases.
            elems = driver.find_elements(By.XPATH, xpath)
            if not elems:
                # Explicit raise instead of `assert`, which is stripped
                # when Python runs with -O.
                raise ValueError('empty ratings')
            # Each `.text` access is a round trip to the browser, so read
            # every element's text exactly once.
            texts = [elem.text for elem in elems]
            # The second ': '-separated field is the numeric score.
            ratings[paper_id] = pd.Series([
                int(text.split(': ')[1])
                for text in texts if text.startswith('Rating:')
            ], dtype=int)
            decision = [
                text.split(': ')[1]
                for text in texts if text.startswith('Decision:')
            ]
            decisions[paper_id] = decision[0] if decision else 'Unknown'
        except KeyboardInterrupt:
            # Stop crawling but fall through to save the partial results.
            break
        except Exception as e:
            # Best effort: report the failure and record an empty entry so
            # the paper still gets a row in the output.
            print(paper_id, e)
            ratings[paper_id] = pd.Series(dtype=int)
            decisions[paper_id] = 'Unknown'
finally:
    # Always shut the browser down so no Edge / driver process is left
    # running, even when the crawl is interrupted or crashes.
    driver.quit()

# One row per paper, one column per rating position.
df = pd.DataFrame(ratings).T
df['decision'] = pd.Series(decisions)
df.index.name = 'paper_id'
df.to_csv('ratings.tsv', sep='\t')