Draft Notes Extractor

Contents

Overview

This command-line Python script extracts draft notes from YouTube video subtitles. The input is a plain text file listing YouTube video URLs (or video ids), one per line. The output is a plain text file containing the draft notes in Markdown format. Open this file, copy its contents, and paste them into the wiki.

For Help

Execute the following command in a terminal (UNIX/Linux) or Command Prompt (Windows):

python create_draft_notes.py -h

The command should output the following:

Usage: create_draft_notes.py [options]

Options:
  -h, --help            show this help message and exit
  -i INPUTFILE, --inputfile=INPUTFILE
                        input file, plain text file containing a list of
                        youtube video ids or urls (one per line)
  -u UNIT, --unit=UNIT  unit name, e.g., "Unit 2: Roads? Where We're Going, We
                        Don't Need Roads."
  -o OUTPUTFILE, --outputfile=OUTPUTFILE
                        output file, markdown formatted draft notes. default
                        is output.txt
  -m MODE, --mode=MODE  mode can be either url or id. default is url

Example Call

Execute the following command in a terminal (UNIX/Linux) or Command Prompt (Windows):

python create_draft_notes.py -i video-urls-unit1.txt -u "Unit 1: Houston We Have a Problem" -o unit1-draft.txt

The input file should be a plain text file similar to this:

File Name: video-urls-unit1.txt

http://www.youtube.com/watch?v=8cLXVG2Q6D4
http://www.youtube.com/watch?v=oVNdwovOTEw
http://www.youtube.com/watch?v=kqkxeUyeNLI
...
...
...

The command should output something similar to the following:

output file is unit1-draft.txt...
extracting: Unit 1: Houston We Have a Problem ...
processing: cs222 unit1 01 l Welcome ...
processing: cs222 unit1 02 l Apollo 13 ...
processing: cs222 unit1 03 q Whats Important ...
...
...
...
*** extraction finished ***

Source Code

File Name: create_draft_notes.py

import sys
import base64
import re
from lxml import etree
from StringIO import StringIO
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtNetwork
import gdata.youtube
import gdata.youtube.service
from HTMLParser import HTMLParser
from django.utils.encoding import smart_str, smart_unicode
from optparse import OptionParser

# Creates notes from Udacity's YouTube video subtitles.
# The following python modules are required to run this script:
#
# - PyQt4
# - GData
# - Django
# - lxml
#
# Author: Marcio Gualtieri, October 2012

class WebPage(QWebPage):
    """Loads and renders a web page (including javascript) to html.

    The constructor blocks: it starts the load and then runs the Qt event
    loop of ``app`` until the ``loadFinished`` signal fires.

    Parameters:
        app      -- the QApplication whose event loop drives the load
        url      -- address of the page to load (string)
        username -- optional user name for HTTP basic authentication
        password -- optional password for HTTP basic authentication;
                    credentials are sent only when both are given
    """
    def __init__(self, app, url, username=None, password=None):
        # initialise the Qt base class before touching the instance
        QWebPage.__init__(self)
        self.app = app

        # build request
        req = QtNetwork.QNetworkRequest()
        req.setUrl(QUrl(url))

        # authentication
        if username is not None and password is not None:
            # b64encode never inserts line breaks; encodestring() wraps its
            # output every 76 characters, so stripping only the trailing
            # newline left embedded newlines in the header for long
            # credentials
            base64string = base64.b64encode('%s:%s' % (username, password))
            authheader = "Basic %s" % base64string
            headerKey = QByteArray("Authorization")
            headerValue = QByteArray(authheader)
            req.setRawHeader(headerKey, headerValue)

        # make request and block in the event loop until it has finished
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(req)
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: leaves the event loop started in __init__.

        ``result`` (the value carried by the signal) is not inspected.
        """
        self.app.quit()

    def renderFrameToHTML(self, name):
        """Returns the html of the frame called ``name`` as a byte string.

        ``'main'`` selects the main frame; any other value is compared with
        the names of the main frame's direct child frames. Returns None when
        no frame has that name.
        """
        if name == 'main':
            return smart_str(self.mainFrame().toHtml())
        for f in self.mainFrame().childFrames():
            if f.frameName() == name:
                return smart_str(f.toHtml())
        return None

class Video():
    """Retrieves youtube video info and subtitles.

    Attributes set by the constructor:
        id    -- the youtube video id
        url   -- address of the subtitles document for that id
        title -- video title as reported by the gdata youtube service
        text  -- all subtitle sentences joined into one string, or None
                 when no subtitle text could be retrieved

    Parameters:
        app -- the QApplication handed on to WebPage to load the subtitles
        url -- video url containing a ``v=<id>`` query parameter
        id  -- video id; exactly one of ``url`` and ``id`` must be given

    Raises:
        Exception  -- when neither or both of ``url`` and ``id`` are given
        IndexError -- when no video id can be found in ``url``
    """
    HEAD_URL = 'http://video.google.com/timedtext?hl=en&lang=en&name=via%20dotsub&v='

    # the "v" query parameter, wherever it sits in the query string; the
    # value stops at the next parameter ("&") or at the fragment ("#")
    _ID_PARAM = re.compile(r'(?:^|[?&])v=([^&#]+)')

    def __init__(self, app, url=None, id=None):
        # accepts either video url or video id
        if url is None and id is not None:
            self.id = id
        elif id is None and url is not None:
            self.id = self._extract_id(url)
        else:
            raise Exception('either video id or video url is required')

        self.url = self.HEAD_URL + self.id
        self.text = None

        # retrieve video info
        yt_service = gdata.youtube.service.YouTubeService()
        entry = yt_service.GetYouTubeVideoEntry(video_id=self.id)
        self.title = entry.media.title.text

        # retrieve subtitles; self.text stays None unless some text is found
        page = WebPage(app, self.url)
        frame = page.renderFrameToHTML('main')
        if frame:
            tree = etree.parse(StringIO(frame))
            # an empty <text/> element has .text == None, skip it instead of
            # failing on the string concatenation
            paragraph = ''.join(' ' + sentence.text
                                for sentence in tree.findall("text")
                                if sentence.text is not None)
            text = smart_str(HTMLParser().unescape(paragraph))
            if text:
                self.text = text

    @staticmethod
    def _extract_id(url):
        """Returns the video id found in ``url``.

        Prefers the value of the ``v`` query parameter, whatever its position
        in the query string. When there is no such parameter the url is split
        on "&" and "=" and the second field is returned, as earlier versions
        of this class did; IndexError is raised when there is no second field.
        """
        match = Video._ID_PARAM.search(url)
        if match:
            return match.group(1)
        fields = re.split(r'[&=]', url)
        if len(fields) < 2:
            raise IndexError('no video id found in url: %s' % url)
        return fields[1]

class Output():
    """Writes to output file in markdown syntax.

    The file named ``filename`` is opened for writing (truncating any
    existing content) by the constructor and stays open until close().
    """
    def __init__(self, filename):
        self.outfile = open(filename, 'w')

    def close(self):
        """Closes the output file."""
        self.outfile.close()

    def _writeHeading(self, text, underline):
        # a heading is the text, a line of ``underline`` characters of the
        # same length, and a blank line
        rule = underline * len(text)
        self.outfile.write("%s\n%s\n\n" % (text, rule))

    def writeToc(self):
        """Writes the table-of-contents tag followed by a blank line."""
        # the tag is assembled from pieces on purpose: OSQA is buggy and
        # tries to render the tag even inside text formatted as source code
        pieces = ["[", "TOC", "]\n\n"]
        self.outfile.write("".join(pieces))

    def writeSection(self, text):
        """Writes ``text`` as a heading underlined with '=' characters."""
        self._writeHeading(text, '=')

    def writeSubsection(self, text):
        """Writes ``text`` as a heading underlined with '-' characters."""
        self._writeHeading(text, '-')

    def writeText(self, text):
        """Writes ``text`` as is; no newline is appended."""
        self.outfile.write("%s" % text)

    def writeLink(self, url, text):
        """Writes a markdown link labelled ``text`` pointing at ``url``."""
        link = "[%s](%s)\n\n" % (text, url)
        self.outfile.write(link)


# command-line arguments parsing
parser = OptionParser()
parser.add_option("-i", "--inputfile", dest="inputfile",
                  help="input file, plain text file containing a list of youtube video ids or urls (one per line)")
parser.add_option("-u", "--unit", dest="unit",
                  help="unit name, e.g., \"Unit 2: Roads? Where We're Going, We Don't Need Roads.\"")
parser.add_option("-o", "--outputfile", dest="outputfile", default="output.txt",
                  help="output file, markdown formatted draft notes. default is output.txt")
parser.add_option("-m", "--mode", dest="mode", default="url",
                  help="mode can be either url or id. default is url")
(options, args) = parser.parse_args()
# input file and unit name are mandatory; parser.error() prints the usage
# message and exits
if not options.inputfile:
    parser.error('input file not given')
if not options.unit:
    parser.error('unit name not given')

# start qt application which runs web page loader
app = QApplication(sys.argv)

# output file (unit name)
# note: print(...) with a single parenthesised argument prints exactly the
# same text as the print statement on Python 2
print('output file is %s...' % options.outputfile)
out = Output(options.outputfile)
try:
    # unit name
    print("extracting: %s ..." % options.unit)
    out.writeSection(options.unit)
    out.writeText("%s\n\n" %  "**These are draft notes from subtitles. Please help to improve them. Thank you!**")

    # get youtube video ids (or urls), one per line; blank lines, e.g. a
    # trailing empty line, are skipped instead of being treated as a video
    with open(options.inputfile) as file_handler:
        stripped = [line.strip() for line in file_handler]
    videos = [entry for entry in stripped if entry]

    # table of contents
    out.writeToc()

    # extract and write contents
    for v in videos:
        # in id mode each line is a bare video id, otherwise a full url
        if options.mode == 'id':
            url = 'http://www.youtube.com/watch?v=' + v
        else:
            url = v
        video = Video(app, url=url)
        print("processing: %s ..." % video.title)
        # only the part of the title before the first '-' names the subsection
        title = re.split(r'[\-]', video.title)
        # create subsection
        out.writeSubsection(title[0].strip())
        if video.text is not None:
            out.writeText("%s\n\n" %  video.text)
        else:
            # no subtitles: link to the video instead
            out.writeLink(url, "Video has no subtitles.")

    print("*** extraction finished ***")
finally:
    # close (and flush) the output file even when a video fails, so that
    # the notes written so far are not lost
    out.close()