How to Make a YouTube Comment Corpus (Without the API)
on NLP, Crawling, YouTube, Corpus, Social Media, Python
All the ways to do it
Of course, YouTube has its own useful API.
But if you are reading this article, you probably already know that it comes with strict limitations, including a quota on the number of comments you can collect per day.
If you are a researcher who wants to collect a small dataset, you can reproduce the manual browsing behaviour in the way described below.
Collecting Comments
Since we are not using the official API, the script has to pretend it is a user scrolling through the trending page.
Our solution is based on Selenium + CSSSelector, so let's start!
import os, sys, time, datetime
import json
import requests
import lxml.html
import io, codecs
from tqdm import tqdm
from lxml.cssselect import CSSSelector
# to drive a real browser
from selenium import webdriver
# to send keycodes to page elements
from selenium.webdriver.common.keys import Keys
# to scrape the URLs and comments from the rendered pages
from bs4 import BeautifulSoup
Selenium - scrolling through the trends
First, we will scroll the trends page with Selenium. You should have Firefox installed to use the code below, or change the driver for the browser of your choice.
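If you prefer to run the crawler without a visible browser window, or with Chrome instead of Firefox, a minimal sketch looks like this (assuming geckodriver or chromedriver is available on your PATH):
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Headless Firefox: pages are rendered without opening a browser window.
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

# Alternative: Chrome, if chromedriver is installed instead of geckodriver.
# driver = webdriver.Chrome()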
# The set where the links to the videos are stored
links = set()
comments = list()
homePage = 'https://www.youtube.com'
linksSize = 10
driver = webdriver.Firefox()
output_path = os.path.join("/media/user/youtube/", str(datetime.date.today()) + ".txt")
def loadFullPage(Timeout):
    reachedbottom = None
    while not reachedbottom:
        # scroll one page down
        driver.execute_script("window.scrollTo(0,Math.max(document.documentElement.scrollHeight,document.body.scrollHeight,document.documentElement.clientHeight));")
        time.sleep(Timeout)
        # check if the bottom is reached
        a = driver.execute_script("return document.documentElement.scrollTop;")
        b = driver.execute_script("return document.documentElement.scrollHeight - document.documentElement.clientHeight;")
        relativeHeight = a / b
        if relativeHeight == 1:
            reachedbottom = True

def getComments(link):
    driver.get(url='https://www.youtube.com' + link)
    loadFullPage(1)
def main():
    driver.get(url=homePage)
    enoughLinks = None
    while not enoughLinks:
        loadFullPage(1)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup.find_all("a", class_="yt-simple-endpoint style-scope ytd-grid-video-renderer", href=True):
            if link['href'] not in links:
                links.add(link['href'])
        if len(links) < linksSize:
            driver.refresh()
        else:
            enoughLinks = True
    output = open(output_path, 'a')
    for link in links:
        output.write(link + "\n")
    output.close()
    print(links)

if __name__ == '__main__':
    main()
CSSSelector - extracting comments
Now that we have saved the unique ids of the trending videos, we can start gathering the comments with requests and CSSSelector. The requests session still sets a desktop browser user agent, so the requests look like they come from an ordinary browser. CSSSelector is optional here: you could get by with plain lxml XPath or even regular expressions, but in my opinion it is the most convenient tool for collecting the trees of comments.
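To see why, here is a tiny self-contained example on toy markup. The class names mimic those used in the extraction code below, but this is not real YouTube HTML:
import lxml.html
from lxml.cssselect import CSSSelector

# Toy markup that imitates the class names used below (not real YouTube HTML).
snippet = '''
<div class="comment-item" data-cid="1">
  <div class="comment-text-content">First comment</div>
</div>
<div class="comment-item" data-cid="2">
  <div class="comment-text-content">Second comment</div>
</div>
'''
tree = lxml.html.fromstring(snippet)
item_sel = CSSSelector('.comment-item')
text_sel = CSSSelector('.comment-text-content')
for item in item_sel(tree):
    # Selectors can be applied to sub-elements, which is handy for nested comment trees.
    print(item.get('data-cid'), text_sel(item)[0].text_content().strip())
# prints: 1 First comment / 2 Second comment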
YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]

def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    time_sel = CSSSelector('.time')
    author_sel = CSSSelector('.user-name')
    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'text': text_sel(item)[0].text_content(),
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content()}

def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]

def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = json.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)

def download_comments(youtube_id, sleep=1):
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)
    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment
    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)
    first_iteration = True
    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}
        params = {'action_load_comments': 1,
                  'order_by_time': True,
                  'filter': youtube_id}
        if first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        page_token, html = response
        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        first_iteration = False
        time.sleep(sleep)
    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}
        params = {'action_load_replies': 1,
                  'order_by_time': True,
                  'filter': youtube_id,
                  'tab': 'inbox'}
        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break
        _, html = response
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        time.sleep(sleep)
def comments_main(youtube_id, output, limit=100):
    try:
        if not youtube_id or not output:
            raise ValueError('you need to specify a Youtube ID and an output filename')
        print('Downloading Youtube comments for video:', youtube_id)
        count = 0
        with io.open(output, 'w', encoding='utf8') as fp:
            for comment in download_comments(youtube_id):
                # one JSON object per line
                fp.write(json.dumps(comment, ensure_ascii=False) + '\n')
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()
                if limit and count >= limit:
                    break
        print('\nDone!')
    except Exception as e:
        print('Error:', str(e))
        sys.exit(1)
Saving comments for the parsed video IDs
The code above contains a few magic numbers, such as the limit=100 default parameter in comments_main; it is deliberately kept small, since this crawler only illustrates how to collect a small scientific corpus. Now we collect the comments in a loop:
wdir = r'/media/user/youtube/comments'
uids = open(r'/media/user/youtube/' + str(datetime.date.today()) + '.txt', 'r', encoding='utf8').readlines()
# str.strip() would eat characters of the id itself, so remove the prefix explicitly
uids = [i.strip().replace('/watch?v=', '') for i in uids]
for uid in tqdm(uids):
    output = os.path.join(wdir, uid + '_' + str(datetime.date.today()) + '.txt')
    comments_main(uid, output)
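Each output file now holds one JSON object per line, so loading the collected comments back into memory is straightforward. A minimal sketch, assuming the same wdir as above:
import os, json

corpus = []
for fname in os.listdir(wdir):
    if not fname.endswith('.txt'):
        continue
    with open(os.path.join(wdir, fname), encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                # each line is a dict with 'cid', 'text', 'time', 'author'
                corpus.append(json.loads(line))
print(len(corpus), 'comments loaded')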
Power-up!
Finally, here is what you can (optionally) do:
- configure a crontab task for your script
- set up a round-robin proxy cycle (see the sketch below)
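For example, a crontab entry such as 0 3 * * * python3 /path/to/crawler.py (the path is illustrative) would run the crawler nightly, and a round-robin proxy cycle can be sketched with itertools.cycle. The proxy addresses below are placeholders:
from itertools import cycle
import requests

# Placeholder proxy addresses -- replace with your own pool.
PROXIES = ['http://10.0.0.1:3128', 'http://10.0.0.2:3128', 'http://10.0.0.3:3128']
proxy_pool = cycle(PROXIES)

def get_with_proxy(session, url, **kwargs):
    # Pick the next proxy from the cycle for every request (round robin).
    proxy = next(proxy_pool)
    return session.get(url, proxies={'http': proxy, 'https': proxy}, **kwargs)

session = requests.Session()
# response = get_with_proxy(session, 'https://www.youtube.com')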
That’s it! You are awesome!
Full Jupyter Notebook can be found here (messy code)