How to Make a YouTube Comment Corpus (Without the API)
on NLP, Crawling, YouTube, Corpus, Social Media, Python
All the ways to do it
Of course, YouTube has its own useful API.
But if you are reading this article, you probably already know that it comes with strict limitations, including a quota on the number of comments you can collect per day.
If you are a researcher who wants to collect a small dataset, you can reproduce the manual browsing behaviour in the way described below.
Collecting Comments
Since we are not using the official API, the script has to pretend it is a user scrolling through the trending page.
Our solution is based on Selenium + CSSSelector, so let's start!
import os, sys, time, datetime
import json
import requests
import lxml.html
import io, codecs
from tqdm import tqdm
from lxml.cssselect import CSSSelector
# to drive a real browser
from selenium import webdriver
# to send keycodes to page elements
from selenium.webdriver.common.keys import Keys
# to scrape the URLs and comments from the rendered pages
from bs4 import BeautifulSoup
Selenium - scrolling through the trends
First, we will scroll the trends page with Selenium. You should have Firefox installed to use the code below, or change the driver for the browser of your choice.
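If you prefer to run the crawler without a visible browser window, or with Chrome instead of Firefox, a minimal sketch looks like this (assuming geckodriver or chromedriver is available on your PATH):
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Headless Firefox: pages are rendered without opening a browser window.
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

# Alternative: Chrome, if chromedriver is installed instead of geckodriver.
# driver = webdriver.Chrome()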
# The set where the links to the videos are stored
links = set()
comments = list()
homePage = 'https://www.youtube.com'
linksSize = 10
driver = webdriver.Firefox()
output_path = os.path.join("/media/user/youtube/", str(datetime.date.today()) + ".txt")
def loadFullPage(Timeout):
    reachedbottom = None
    while not reachedbottom:
        # scroll one page down
        driver.execute_script("window.scrollTo(0,Math.max(document.documentElement.scrollHeight,document.body.scrollHeight,document.documentElement.clientHeight));")
        time.sleep(Timeout)
        # check if the bottom is reached
        a = driver.execute_script("return document.documentElement.scrollTop;")
        b = driver.execute_script("return document.documentElement.scrollHeight - document.documentElement.clientHeight;")
        relativeHeight = a / b
        if relativeHeight == 1:
            reachedbottom = True

def getComments(link):
    driver.get(url='https://www.youtube.com' + link)
    loadFullPage(1)
def main():
    driver.get(url=homePage)
    enoughLinks = None
    while not enoughLinks:
        loadFullPage(1)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup.find_all("a", class_="yt-simple-endpoint style-scope ytd-grid-video-renderer", href=True):
            if link['href'] not in links:
                links.add(link['href'])
        if len(links) < linksSize:
            driver.refresh()
        else:
            enoughLinks = True
    output = open(output_path, 'a')
    for link in links:
        output.write(link + "\n")
    output.close()
    print(links)

if __name__ == '__main__':
    main()
CSSSelector - extracting comments
Now that we have saved the unique ids of the trending videos, we can start gathering the comments with requests and CSSSelector. The requests session still sets a desktop browser user agent, so the requests look like they come from an ordinary browser. CSSSelector is optional here: you could get by with plain lxml XPath or even regular expressions, but in my opinion it is the most convenient tool for collecting the trees of comments.
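To see why, here is a tiny self-contained example on toy markup. The class names mimic those used in the extraction code below, but this is not real YouTube HTML:
import lxml.html
from lxml.cssselect import CSSSelector

# Toy markup that imitates the class names used below (not real YouTube HTML).
snippet = '''
<div class="comment-item" data-cid="1">
  <div class="comment-text-content">First comment</div>
</div>
<div class="comment-item" data-cid="2">
  <div class="comment-text-content">Second comment</div>
</div>
'''
tree = lxml.html.fromstring(snippet)
item_sel = CSSSelector('.comment-item')
text_sel = CSSSelector('.comment-text-content')
for item in item_sel(tree):
    # Selectors can be applied to sub-elements, which is handy for nested comment trees.
    print(item.get('data-cid'), text_sel(item)[0].text_content().strip())
# prints: 1 First comment / 2 Second comment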
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]

def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')
    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}

def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)

def download_comments(youtube_id, sleep=1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)
    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment
    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)
    first_iteration = True
    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}
        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        page_token, html = response
        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        first_iteration = False
        time.sleep(sleep)
    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}
        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        _, html = response
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        time.sleep(sleep)
def comments_main(youtube_id, output, limit=100):
    try:
        if not youtube_id or not output:
            raise ValueError('you need to specify a Youtube ID and an output filename')
        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        with io.open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                # one JSON object per line
                fp.write(json.dumps(comment, ensure_ascii=False) + '\n')
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break
        print('\nDone!')
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)
Saving comments for the parsed video IDs
The code above contains a few magic numbers, such as the limit=100 default parameter in comments_main; it is deliberately kept small, since this crawler only illustrates how to collect a small scientific corpus. Now we collect the comments in a loop:
wdir = r'/media/user/youtube/comments'
uids = open(r'/media/user/youtube/' + str(datetime.date.today()) + '.txt', 'r', encoding='utf8').readlines()
# str.strip() would eat characters of the id itself, so remove the prefix explicitly
uids = [i.strip().replace('/watch?v=', '') for i in uids]
for uid in tqdm(uids):
    output = os.path.join(wdir, uid + '_' + str(datetime.date.today()) + '.txt')
    comments_main(uid, output)
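Each output file now holds one JSON object per line, so loading the collected comments back into memory is straightforward. A minimal sketch, assuming the same wdir as above:
import os, json

corpus = []
for fname in os.listdir(wdir):
    if not fname.endswith('.txt'):
        continue
    with open(os.path.join(wdir, fname), encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                # each line is a dict with 'cid', 'text', 'time', 'author'
                corpus.append(json.loads(line))
print(len(corpus), 'comments loaded')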
Power-up!
Finally, here is what you can (optionally) do:
- configure a crontab task for your script
- set up a round-robin proxy cycle (see the sketch below)
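For example, a crontab entry such as 0 3 * * * python3 /path/to/crawler.py (the path is illustrative) would run the crawler nightly, and a round-robin proxy cycle can be sketched with itertools.cycle. The proxy addresses below are placeholders:
from itertools import cycle
import requests

# Placeholder proxy addresses -- replace with your own pool.
PROXIES = ['http://10.0.0.1:3128', 'http://10.0.0.2:3128', 'http://10.0.0.3:3128']
proxy_pool = cycle(PROXIES)

def get_with_proxy(session, url, **kwargs):
    # Pick the next proxy from the cycle for every request (round robin).
    proxy = next(proxy_pool)
    return session.get(url, proxies={'http': proxy, 'https': proxy}, **kwargs)

session = requests.Session()
# response = get_with_proxy(session, 'https://www.youtube.com')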
That’s it! You are awesome!
Full Jupyter Notebook can be found here (messy code)