-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsocialcheck.py
73 lines (52 loc) · 1.93 KB
/
socialcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import tomllib
import requests
from bs4 import BeautifulSoup
from scrapy.selector import Selector
from lxml import html as lxhtml
from brightdata import proxy_servers
def get_social_xpath(filename="socials.toml"):
socials = list()
with open(filename, "rb") as f:
data = tomllib.load(f)
for name, item in data.items():
socials.append({"name": name, **item})
s = [f"contains(@href, '{site.get('domain')}')" for site in socials]
s = " or ".join([f"contains(@href, '{site.get('domain')}')" for site in socials])
s = f".//a[{s}]"
return s
def get_html(url, timeout=30):
    """Fetch *url* through the configured proxies and return the body text.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one the request may wait indefinitely.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.HTTPError: If the response status code is not 200.
    """
    response = requests.get(url, proxies=proxy_servers, timeout=timeout)
    # Explicit raise rather than ``assert``: asserts are stripped when Python
    # runs with -O, which would let error pages through silently.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"unexpected status {response.status_code} for {url}",
            response=response,
        )
    return response.text
def meta_info(url, soup=None):
    """Yield ``(label, value)`` pairs describing links found in a page.

    The page markup is taken from *soup* when one is supplied (serialised
    with ``str(soup)``); otherwise it is downloaded with ``get_html(url)``.
    The markup is parsed once with lxml and queried with XPath.

    Args:
        url: Page address; only requested when *soup* is falsy.
        soup: Optional already-parsed document whose ``str()`` is its markup.

    Yields:
        Tuples of ``(label, value)`` in this order:
        ``"meta-see-also"`` (``og:see_also`` meta content),
        ``"meta-twitter-site"`` (``twitter:site`` meta content),
        ``"icons"``, ``"search"`` and ``"oembed-json"`` (``href`` of
        ``<link>`` elements with rel ``icon``, ``search`` and ``alternate``),
        then ``"body"`` with the ``href`` of every anchor matched by
        ``get_social_xpath()``.
    """
    # Previously ``html`` was only bound in the fetch branch, so passing a
    # soup raised UnboundLocalError on the parse below.
    html = str(soup) if soup else get_html(url)
    tree = lxhtml.fromstring(html)

    attribute_queries = (
        ("meta-see-also", "//meta[@property='og:see_also']/@content"),
        ("meta-twitter-site", "//meta[@name='twitter:site']/@content"),
        ("icons", "//link[@rel='icon']/@href"),
        ("search", "//link[@rel='search']/@href"),
        # NOTE(review): every rel='alternate' link gets the "oembed-json"
        # label, whatever its type attribute is -- confirm this is intended.
        ("oembed-json", "//link[@rel='alternate']/@href"),
    )
    for label, query in attribute_queries:
        for value in tree.xpath(query):
            yield label, value

    # This query returns elements, not attribute strings, so read the href
    # from each matched anchor.
    for anchor in tree.xpath(get_social_xpath()):
        yield "body", anchor.attrib.get("href")
if __name__ == "__main__":
    # Manual smoke run: collect everything meta_info yields for a sample page
    # and print it as a single list.
    from pprint import pprint

    sample_url = "https://bas.bio"
    findings = list(meta_info(sample_url))
    print(findings)
    # Other pages previously used for manual checks:
    # pprint(list(meta_info("https://onesignal.com")))
    # pprint(list(meta_info("https://bas.codes/posts/this-week-python-059")))