-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
72 lines (62 loc) · 2.54 KB
/
parser.py
File metadata and controls
72 lines (62 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class CommentParser:
    """Pull comment records and a paging token out of nested JSON-like data.

    All methods are stateless and operate on plain ``dict``/``list`` trees
    (e.g. the result of ``json.loads``).

    NOTE(review): the key names used here (``commentRenderer``,
    ``commentViewModel``, ``continuationItemRenderer``) look like YouTube's
    web API payloads -- confirm against real responses.
    """

    @staticmethod
    def find_key(obj, target_key):
        """Yield every value stored under ``target_key`` anywhere in ``obj``.

        Dicts and lists are walked depth-first in iteration order. A matching
        value is yielded first and then searched itself, so nested matches
        are reported as well. Anything that is not a dict or list is ignored.
        """
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == target_key:
                    yield value
                yield from CommentParser.find_key(value, target_key)
        elif isinstance(obj, list):
            for item in obj:
                yield from CommentParser.find_key(item, target_key)

    @staticmethod
    def extract_text(obj):
        """Flatten a text-bearing node into a plain string.

        * list  -> concatenation of the text of every element
        * dict  -> text of ``"runs"``, else ``"text"``, else ``"simpleText"``
        * str / int / float -> ``str(obj)``
        * anything else (including ``None``) -> ``""``
        """
        if isinstance(obj, list):
            return "".join(CommentParser.extract_text(item) for item in obj)
        if isinstance(obj, dict):
            if "runs" in obj:
                return CommentParser.extract_text(obj["runs"])
            # Recurse instead of calling str() directly so that a None value
            # becomes "" (not the literal "None") and a nested node is
            # flattened rather than rendered as a Python repr.
            # "simpleText" is accepted as a fallback spelling of "text";
            # presumably single-string fields use it -- verify against caller.
            for key in ("text", "simpleText"):
                if key in obj:
                    return CommentParser.extract_text(obj[key])
            return ""
        if isinstance(obj, (str, int, float)):
            return str(obj)
        return ""

    @staticmethod
    def _largest_thumbnail_url(thumbnail_obj):
        """Return the ``url`` of the widest entry in ``thumbnail_obj``, or ``""``.

        ``thumbnail_obj`` is expected to be a dict holding a ``"thumbnails"``
        list of dicts. Malformed input (wrong container type, non-dict
        entries, missing or non-numeric ``width``) never raises: bad entries
        are skipped and a bad width counts as 0. On ties the first entry wins.
        """
        if not isinstance(thumbnail_obj, dict):
            return ""
        thumbs = thumbnail_obj.get("thumbnails")
        if not isinstance(thumbs, list):
            return ""
        candidates = [thumb for thumb in thumbs if isinstance(thumb, dict)]
        if not candidates:
            return ""

        def width_of(thumb):
            width = thumb.get("width")
            return width if isinstance(width, (int, float)) else 0

        return max(candidates, key=width_of).get("url") or ""

    @classmethod
    def extract_batch(cls, data):
        """Collect comments and the next continuation token from ``data``.

        Returns a ``(comments, next_token)`` tuple.

        ``comments`` is a list of dicts with the keys ``type``, ``author``,
        ``text``, ``replyCount``, ``likes`` and ``pfp_url`` (all strings).
        Every ``commentRenderer`` node with non-empty text produces a
        ``"standard"`` record. Only when none are found are
        ``commentViewModel`` nodes tried, producing ``"viewmodel"`` records
        with a placeholder author and empty counters.

        ``next_token`` is the token of the first ``continuationItemRenderer``
        that has ``continuationEndpoint.continuationCommand.token``, or
        ``None`` when there is none.

        Nodes of an unexpected type are skipped instead of raising.
        """
        comments = []

        for renderer in cls.find_key(data, "commentRenderer"):
            # find_key yields whatever sits under the key; skip non-dicts.
            if not isinstance(renderer, dict):
                continue
            text = cls.extract_text(renderer.get("contentText"))
            if not text:
                continue
            comments.append({
                "type": "standard",
                "author": cls.extract_text(renderer.get("authorText")),
                "text": text,
                "replyCount": cls.extract_text(renderer.get("replyCount")),
                "likes": cls.extract_text(renderer.get("voteCount")),
                "pfp_url": cls._largest_thumbnail_url(
                    renderer.get("authorThumbnail")
                ),
            })

        # Fallback format, used only when no standard comment was found.
        if not comments:
            for view_model in cls.find_key(data, "commentViewModel"):
                outer = (
                    view_model.get("content")
                    if isinstance(view_model, dict)
                    else None
                )
                inner = outer.get("content") if isinstance(outer, dict) else None
                text = cls.extract_text(inner)
                if not text:
                    continue
                comments.append({
                    "type": "viewmodel",
                    "author": "User",
                    "text": text,
                    "replyCount": "",
                    "likes": "",
                    # Same key set as "standard" records so consumers can
                    # index every comment uniformly.
                    "pfp_url": "",
                })

        next_token = None
        for item in cls.find_key(data, "continuationItemRenderer"):
            try:
                next_token = item["continuationEndpoint"]["continuationCommand"]["token"]
            except (KeyError, TypeError):
                # This item has a different shape; try the next one.
                continue
            break

        return comments, next_token