-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebnovel.py
129 lines (107 loc) · 4.9 KB
/
webnovel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import time
import traceback
from typing import Optional
from urllib.parse import urlencode
import requests
from discord_webhook import DiscordWebhook
from requests import Response
from tqdm.auto import tqdm
class WebNovel:
    """Small client for the webnovel.com endpoints under ``/go/pcm``.

    Every call is an HTTP GET made with ``requests``. When a proxy URL is
    configured, the real target URL is handed to the proxy in its ``u`` query
    parameter. Failed requests are retried up to ``__MAX_RETRIES`` times and
    every failure is reported with ``tqdm.write`` and, if a webhook URL was
    given, through a Discord webhook.
    """

    __BASE_URL: str = "https://www.webnovel.com/go/pcm"
    __MAX_RETRIES: int = 2
    # Seconds ``requests`` may wait for the connection / for data before giving
    # up. Without a timeout a stalled connection would block forever and the
    # retry logic below would never get a chance to run.
    __TIMEOUT: float = 60.0
    __HEADERS: dict[str, str] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0)"
    }

    def __init__(self, proxy_url: Optional[str], webhook_url: Optional[str]):
        """Store the proxy configuration and set up the optional webhook.

        Args:
            proxy_url: URL of a proxy that takes the target URL in a ``u``
                query parameter, or ``None`` to contact the site directly.
            webhook_url: Discord webhook URL used for failure notifications;
                a falsy value disables notifications.
        """
        self.__proxy_url = proxy_url
        self.__webhook: Optional[DiscordWebhook] = None
        if webhook_url:
            self.__webhook = DiscordWebhook(url=webhook_url)

    def __send_webhook(
        self, content: str, embed_description: Optional[str] = None
    ) -> None:
        """Send ``content`` to the webhook; do nothing if none is configured.

        Args:
            content: Message text.
            embed_description: Optional text attached as an embed description.
        """
        if self.__webhook is None:
            return
        self.__webhook.set_content(content=content)
        if embed_description is not None:
            self.__webhook.add_embed({"description": embed_description})
        # The webhook object is reused between calls; ``remove_embeds=True``
        # asks the library to drop the embed after sending.
        self.__webhook.execute(remove_embeds=True)

    def __build_url(self, path: str, payload: Optional[dict] = None) -> str:
        """Return the URL to fetch for ``path`` with ``payload`` as its query.

        If a proxy is configured, the result is the proxy URL with the full
        target URL URL-encoded into its ``u`` parameter.
        """
        query = urlencode(payload) if payload else ""
        url = f"{self.__BASE_URL}{path}?{query}"
        if self.__proxy_url is not None:
            url = f"{self.__proxy_url}?{urlencode({'u': url})}"
        return url

    def __request(self, path: str, payload: Optional[dict] = None) -> "Response":
        """GET ``path`` and return the successful response, retrying on failure.

        Args:
            path: Endpoint path appended to the base URL.
            payload: Optional query parameters.

        Returns:
            The first response whose status passes ``raise_for_status()``.

        Raises:
            Exception: If no attempt succeeded within ``__MAX_RETRIES`` tries.
        """
        # The URL does not change between attempts, so build it once.
        url = self.__build_url(path, payload)

        for attempt in range(1, self.__MAX_RETRIES + 1):
            retries_left = attempt < self.__MAX_RETRIES
            response: Optional[Response] = None
            try:
                response = requests.get(
                    url=url, headers=self.__HEADERS, timeout=self.__TIMEOUT
                )
                response.raise_for_status()
                return response
            except requests.RequestException:
                # ``response`` is still None when the request itself failed
                # (connection error, timeout, ...) and set when the server
                # answered with an error status.
                rate_limited = response is not None and response.status_code == 429

                # Only wait out a rate limit when another attempt will follow;
                # on the final attempt a 429 is reported like any other HTTP
                # error instead of sleeping for a retry that never happens.
                if rate_limited and retries_left:
                    message = "RATE LIMITED. Sleeping for a minute and retrying"
                    self.__send_webhook(f":warning: {message}")
                    tqdm.write(message)
                    time.sleep(60)
                    continue

                if response is not None:
                    message = f"Request failed with HTTP error {response.status_code}"
                    embed_description = None
                else:
                    message = "Request failed with error"
                    embed_description = "```py\n" + traceback.format_exc() + "```"

                message += f"\nAttempt {attempt}/{self.__MAX_RETRIES}"
                # Back-off grows exponentially with the attempt number (minutes).
                sleep_minutes: Optional[int] = 2**attempt if retries_left else None
                if sleep_minutes is not None:
                    message += f"\nRetrying in {sleep_minutes} minutes"

                self.__send_webhook(
                    content=f":warning: {message}",
                    embed_description=embed_description,
                )
                tqdm.write(message)
                if sleep_minutes is not None:
                    time.sleep(sleep_minutes * 60)

        message = f"Failed to get response after {self.__MAX_RETRIES} tries"
        self.__send_webhook(f":warning::warning::warning: {message}")
        raise Exception(message)

    def __api_request(self, path: str, payload: Optional[dict] = None) -> dict:
        """Fetch ``path`` and return the response body decoded as JSON.

        Raises:
            Exception: If the request ultimately fails, or if the body is not
                valid JSON (the original decode error is chained as the cause).
        """
        response = self.__request(path, payload)
        try:
            return response.json()
        except requests.exceptions.JSONDecodeError as err:
            message = "Malformed json response, likely captcha"
            self.__send_webhook(f":warning::warning::warning: {message}")
            raise Exception(message) from err

    def __category_request(self, page: Optional[int] = None) -> dict:
        """Query the category listing endpoint, optionally for a given page.

        The filter values are fixed; their meaning is defined by the remote
        API and is not visible in this file.
        """
        payload: dict[str, int] = {
            "bookStatus": 0,
            "categoryId": 0,
            "categoryType": 2,
            "orderBy": 5,
        }
        if page is not None:
            payload["pageIndex"] = page
        return self.__api_request("/category/categoryAjax", payload)

    def get_pagination_info(self) -> tuple[int, int]:
        """Return ``(len(items), total)`` from a listing request with no page.

        NOTE(review): presumably the page size and the overall number of
        comics -- confirm against the API.
        """
        data: dict = self.__category_request()["data"]
        return len(data["items"]), data["total"]

    def get_comic_ids(self, page: int) -> list[int]:
        """Return the ``bookId`` of every item on listing page ``page``."""
        items: list[dict] = self.__category_request(page)["data"]["items"]
        return [int(item["bookId"]) for item in items]

    def get_chapter_ids(self, comic_id: int) -> list[int]:
        """Return the ``chapterId`` of every chapter listed for ``comic_id``."""
        response: dict = self.__api_request(
            "/comic/getChapterList", {"comicId": comic_id}
        )
        chapters = response["data"]["comicChapters"]
        return [int(chapter["chapterId"]) for chapter in chapters]

    def get_chapter_upload_time(self, comic_id: int, chapter_id: int) -> int:
        """Return the ``publishTime`` field of the given chapter.

        NOTE(review): the unit of the timestamp is whatever the API returns;
        it is passed through unchanged.
        """
        response: dict = self.__api_request(
            "/comic/getContent", {"comicId": comic_id, "chapterId": chapter_id}
        )
        return response["data"]["chapterInfo"]["publishTime"]