"""Retrieve live chat content from YouTube videos."""

import csv
import json

import pytchat
import ray
|
# Initialize Ray so that fetch_live_chat (defined below) can run as
# parallel remote tasks.
ray.init()
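# Note: a bare ray.init() uses all available cores; passing num_cpus
# (e.g. ray.init(num_cpus=4)) would cap how many tasks run concurrently.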
|
|
|
urls = []

# AllScraped.csv holds one row per video: the first column is the
# YouTube video ID and the second column is the video title.
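# Illustrative layout (header names and titles here are made up; only the
# two-column shape matters):
#
#   video_id,title
#   XQhBHnPIsRk,Some stream title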
|
with open('AllScraped.csv', mode='r') as file_in:
    reader = csv.reader(file_in)
    next(reader)  # skip the header row
    for row in reader:
        urls.append(row[0])
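# An equivalent sketch using csv.DictReader, assuming the header's first
# column is literally named "video_id" (hypothetical; check the real file):
#
#   with open('AllScraped.csv') as f:
#       urls = [row['video_id'] for row in csv.DictReader(f)]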
|
|
|
# After the loop completes, `urls` holds every video ID; the list is used
# at the end of this script to retrieve the live chat for each video.
|
|
@ray.remote
def fetch_live_chat(video_id):
    """Fetch live chat messages for a YouTube video ID.

    Args:
        video_id (str): the video ID, i.e. the ID in
            https://www.youtube.com/watch?v=ID

    Returns:
        None. Writes the messages to `<video_id>.json` and
        `<video_id>.csv`.

    Usage: fetch_live_chat.remote('XQhBHnPIsRk')
    """
    print("current ID:", video_id)

    chat = pytchat.create(video_id=video_id)
    chats = []
    keys = ['author', 'message']

    # Poll until the stream's chat ends, collecting one dict per message.
    while chat.is_alive():
        print("chat is alive...")
        for c in chat.get().sync_items():
            print(f"{c.datetime} [{c.author.name}]- {c.message}")
            chats.append({'author': c.author.name, 'message': c.message})

    with open(f"{video_id}.json", "w") as fout:
        json.dump(chats, fout)

    with open(f"{video_id}.csv", "w", newline='') as file_out:
        dict_writer = csv.DictWriter(file_out, keys)
        dict_writer.writeheader()
        dict_writer.writerows(chats)
|
|
def read_from_json(video_id):
    """Load previously scraped chat messages from `<video_id>.json`."""
    with open(f"{video_id}.json") as file_in:
        return json.load(file_in)
|
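# Example round-trip (assumes XQhBHnPIsRk.json was already written by
# fetch_live_chat):
#   chats = read_from_json('XQhBHnPIsRk')
#   print(chats[0]['author'], chats[0]['message'])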
|
|
# Scrape all videos. The fetch_live_chat.remote(url) invocation follows
# Ray's convention for launching remote tasks; ray.get() blocks until
# every task has finished (otherwise the script could exit early).
ray.get([fetch_live_chat.remote(url) for url in urls])
# After this call returns, each video has two files: VIDEO_ID.json and VIDEO_ID.csv.
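# To act on results as individual tasks finish, rather than blocking on
# all of them, ray.wait can be used instead (sketch):
#   refs = [fetch_live_chat.remote(url) for url in urls]
#   while refs:
#       done, refs = ray.wait(refs)  # pops one finished task per loop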
|
|
|
# If some videos were already scraped, save their IDs in an `ignore_urls`
# set and run this instead of the ray.get call above:
#
#   ignore_urls = set()  # IDs of already-scraped videos
#   ray.get([fetch_live_chat.remote(url) for url in urls
#            if url not in ignore_urls])