main.py
import json
import os
import pprint
import time

import requests
from selenium import webdriver
import selenium.common.exceptions as selenium_exceptions
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
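# Helper: stream-download a file from `url` into `dst_path` in 2000-byte chunks.
# (Defined for fetching PDFs; it is not called in the main flow below.)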
def download_pdf(url, dst_path):
r = requests.get(url, stream=True)
with open(dst_path, 'wb') as fd:
for chunk in r.iter_content(2000):
fd.write(chunk)
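# Helper: return the href attribute of the element at `xpath`, or None if no such element exists.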
def get_href_by_xpath(browser, xpath):
try:
pdf_href = browser.find_element_by_xpath(xpath).get_attribute('href')
    except selenium_exceptions.NoSuchElementException:
pdf_href = None
return pdf_href
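# Helper: create the directory `path` if it does not already exist.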
def mkdir_(path):
    if not os.path.isdir(path):
        os.mkdir(path)
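# Helper: wait up to 30 s for the element at `xpath` to be present, then click it.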
def waiting_with_xpath_click(browser, xpath):
WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, xpath)))
browser.find_element_by_xpath(xpath).click()
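# configuration: papers to look up, chromedriver path, and output directory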
# zhengxiawu_list = [{'index': 68, 'title': 'Dense auto-encoder hashing for robust cross-modality retrieval'}]
paper_list = [{'index': 10, 'title': 'self_supervised learning'}]
#browser = webdriver.Chrome()
browser = webdriver.Chrome(executable_path='C:/Program Files (x86)/Google/Chrome/Application/chromedriver')
dst_dir ='C:/Users/l_xho/Desktop/学习日志/Self-supervised Learning'
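# For each paper: create an output folder, search Google Scholar for the title, record the
# APA citation and (if present) a PDF link, then walk its reference listing page by page.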
for paper in paper_list:
this_paper_information = {'title': paper['title']}
# create the folder
this_paper_dir_name = os.path.join(dst_dir, str(paper['index']) + ' ' + paper['title'])
mkdir_(this_paper_dir_name)
browser.get('https://scholar.google.com')
browser.set_window_size(800, 1000)
time.sleep(1)
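    # type the title into the Scholar search box and start the search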
browser.find_element_by_xpath('//*[@id="gs_hdr_tsi"]').send_keys(paper['title'])
waiting_with_xpath_click(browser, '//*[@id="gs_hdr_tsb"]')
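    # long pause: leaves time to clear Google's human-verification check by hand if it appears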
time.sleep(120)
waiting_with_xpath_click(browser, '//*[@id="gs_res_ccl_mid"]/div/div[@class="gs_ri"]/div[3]/a[2]')
time.sleep(4)
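    # read the APA-formatted entry from the third row of the citation pop-up, then close the pop-up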
this_paper_apa_text = browser.find_element_by_xpath('//*[@id="gs_citt"]/table/tbody/tr[3]/td/div').text
waiting_with_xpath_click(browser, '//*[@id="gs_cit-x"]')
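    # grab a direct full-text/PDF link next to the first result, if one is shown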
pdf_href = get_href_by_xpath(browser, '//*[@id="gs_res_ccl_mid"]/div/div[1]/div/div/a')
this_paper_information['apa'] = this_paper_apa_text
this_paper_information['pdf_href'] = pdf_href
    # open the list of this paper's reference papers
waiting_with_xpath_click(browser, '//*[@id="gs_res_ccl_mid"]/div/div[@class="gs_ri"]/div[3]/a[3]')
this_paper_information['reference'] = []
flag = True
    # pause here (or set a breakpoint) to get past the human-verification check manually
time.sleep(120)
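    # page through the reference results; `flag` stays True until the last page is reached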
while flag:
elements = browser.find_elements_by_xpath('//*[@id="gs_res_ccl_mid"]//div[@class="gs_r gs_or gs_scl"]')
ele_length = len(elements)
del elements
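        # re-locate each result by index on every pass, since opening and closing the
        # citation pop-up can leave previously found elements stale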
for i in range(ele_length):
# elements = browser.find_elements_by_xpath('//*[@id="gs_res_ccl_mid"]//div[@class="gs_r gs_or gs_scl"]')
# element = browser.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[{}]'.format(str(i+1)))
# _a = element.find_element_by_xpath('//h3//a')
_a = browser.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[{}]/div[@class="gs_ri"]/h3/a'.format(str(i+1)))
reference_paper_name = _a.text
print(reference_paper_name)
reference_paper_href = get_href_by_xpath(browser, '//*[@id="gs_res_ccl_mid"]/div[{}]/div[1]/div/div/a'.
format(str(i+1)))
time.sleep(1)
browser.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[{}]/div[@class="gs_ri"]/div[3]/a[2]'.format(str(i+1))).click()
# element.find_element_by_xpath('//a[@class="gs_or_cit gs_nph"]').click()
time.sleep(4)
reference_paper_apa_text = browser.find_element_by_xpath('//div[@id="gs_citt"]/table/tbody/tr[3]/td/div').text
browser.find_element_by_xpath('//*[@id="gs_cit-x"]').click()
this_paper_information['reference'].append({
"name": reference_paper_name,
"href": reference_paper_href,
"apa": reference_paper_apa_text
})
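        # try the 'next page' button in the results navigation bar; when it carries no
        # onclick handler, treat this as the last page and stop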
try:
button = browser.find_element_by_xpath('//*[@id="gs_nm"]/button[2]')
button_onclick = button.get_attribute('onclick')
if button_onclick is not None:
button.click()
else:
flag = False
        except selenium_exceptions.NoSuchElementException:
flag = False
# try:
#
# browser.find_element_by_xpath('//*[@id="gs_nm"]/button[2]').click()
# time.sleep(4)
# flag = True
# except WebDriverException:
# flag = False
    # write the collected metadata to info.json; the 'with' block makes sure the file is closed
    with open(os.path.join(this_paper_dir_name, 'info.json'), 'w', encoding='utf-8') as info_file:
        json.dump(this_paper_information, info_file, ensure_ascii=False)
pprint.pprint(this_paper_information)