#!/usr/bin/python
"""
Download blog entries as a backup.
Capture all page elements. Store complete pages as ZIP archives.
Redownload and overwrite archives to capture changed entries (comments), until entries hit max-age.

Does not require a site feed.

Copyright 2009 Richard Esplin -- richard-oss@esplins.org
Licensed under the GNU GPL V3 or (at your option) any later version.

Requires python 2.5 and python-mechanize
"""

import sys, os, shutil, re, zipfile, datetime
from os import path
import mechanize
from mechanize import Browser

# --- User configuration: edit these values before running the script ---
blog_url = "http://YOURBLOG" # Base URL -- first visit directs to login page
archive_url = blog_url # URL with links to all posts (or all recent posts): a feed, or a javascript menu.
# Directory that receives the finished .zip archives. With the empty string
# the archive and tmp paths below are relative to the current working directory.
backup_dir = ""
maxage = 8 # don't download posts older than this many weeks, rounded to include entire month

# Credentials entered into the "Email" and "Passwd" fields of the login form.
username = 'YOURUSER'
password = 'YOURPASS'

# Scratch directory in which each page is assembled before it is zipped.
tmp_dir = path.join(backup_dir, "tmp")

class ExceptionDocumentEmpty(Exception):
    """Raised when a downloaded page contains no lines to save."""
    # The previous form `class X: Exception` only evaluated the name
    # `Exception` inside the class body; it did not inherit from it.
class ExceptionNoTagSrc(Exception):
    """Raised when no downloadable URL can be extracted for a tag."""
    # The previous form `class X: Exception` only evaluated the name
    # `Exception` inside the class body; it did not inherit from it.


def uniquelist(dupslist):
    """Return the items of dupslist with duplicates removed.

    The first occurrence of every item is kept and the original order is
    preserved. Items must be hashable.
    """
    # Order-preserving de-duplication; the idea is credited to Duncan Booth
    # on http://bytes.com/groups/python
    seen = set()
    unique = []
    for item in dupslist:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
    return unique


def setupBrowser():
    """Create a mechanize Browser with a replaced User-agent header.

    Returns:
        A new Browser whose addheaders list carries exactly one
        'User-agent' entry, the Konqueror string set below.
    """
    browser = Browser()

    # Override user agent.
    # Build the filtered list in one go instead of calling remove() while
    # iterating: removing during iteration makes the loop skip the element
    # that follows each removed one, so a second match could survive.
    browser.addheaders[:] = [h for h in browser.addheaders
                             if 'User-agent' not in h]
    browser.addheaders.append(('User-agent', "Konqueror/3.5; Linux KHTML/3.5.10"))

    return browser


def login(browser):
    """Submit the login form when the current page is a sign-in page.

    The page counts as a sign-in page when its title contains 'sign in'
    (case-insensitive). In that case the form with id 'loginform' is filled
    with the module-level username/password and submitted. Otherwise
    "Login failed!" is printed and nothing is submitted.

    Args:
        browser: a browser object that has already opened a page.

    Returns:
        The same browser object that was passed in.
    """
    # title() may be None (mechanize documents this for pages without a
    # <title> element); treat that like any other non-sign-in page instead
    # of crashing on None.lower().
    title = browser.title() or ''
    if 'sign in' in title.lower():
        browser.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id']=='loginform')
        browser.set_value(username, name="Email")
        browser.set_value(password, name="Passwd")
        browser.submit()
    else:
        # Parenthesised single string: prints identically as a Python 2
        # print statement and as a print() call.
        print("Login failed!")
    return browser


def grabPage(url, localfilepath):
    '''Download url to localfilepath without any corrections or recursion.

    The request is made through the module-level browser `br`.

    Args:
        url: address to fetch; one leading and one trailing quote character
            (single or double) is stripped if present.
        localfilepath: path of the file to create or overwrite.

    Returns:
        True when the response was written to localfilepath, False when the
        URL could not be opened (in which case no file is created).
    '''
    # Strip quotes from URL
    if url.startswith(('"', "'")): url = url[1:]
    if url.endswith(('"', "'")): url = url[:-1]
    # Get page and save it. Opening is deliberately best-effort: a page
    # element that cannot be fetched must not abort the whole backup.
    try:
        br.open(url)
    except Exception:
        #print "URL %s could not be downloaded: %s"%(url, e)
        return False # Continue processing
    r = br.response()
    # Binary mode: the targets include images, which text mode would corrupt
    # on platforms that translate line endings.
    f = open(localfilepath, 'wb')
    try:
        # Chunked copy; also guarantees the file is closed if a read or
        # write fails part-way.
        shutil.copyfileobj(r, f)
    finally:
        f.close()
    return True


def strip_chars(str):
    """Remove all characters that should not be in a directory or filename.

    Only ASCII letters, digits and '.' are kept. The empty string and None
    both yield ''.
    """
    if str in ('', None):
        return ''
    return re.sub('[^A-Za-z0-9.]', '', str)


# Regular expression pulls out two groups: the last directory and the filename.
# The directory name can be used if the filename is blank or not unique.
# The pattern only matches quoted URLs that begin with "http"; the whole match
# (including the surrounding quotes) is what gets downloaded and replaced.
urlmatch = re.compile('["\']http.*?/([A-Za-z0-9%.\r\n\t \-_]*)/([A-Za-z0-9%.\r\n\t \-_]*)["\']', re.DOTALL)
def cleanAndFollow(line, destdir):
    """Download tag destination, and point tag at local file.

    Scans line for '<img ' (src), '<link ' (href) and '<script ' (src) tags.
    For each one whose URL matches urlmatch, the target is downloaded into
    destdir with grabPage and the URL in the line is replaced by the local
    filename.

    Returns the line (rewritten where downloads succeeded) if at least one
    tag had a matching URL, otherwise False.
    """
    cleanAndFollow.cleaned = False # not global scope, but still accessible to inner functions
    # cleanlink: rewrite the first urlmatch hit after `attribute=` (searched
    # from startpos). Returns the new line, or the unchanged line when the
    # download fails. Raises ExceptionNoTagSrc when no URL matches.
    def cleanlink(line, attribute, startpos):
        # extract url and filename from link
        # NOTE(review): find() returns -1 when the attribute is absent; startpos
        # then becomes len(attribute), so the URL search starts near the
        # beginning of the line rather than at the tag -- confirm intended.
        attribute_index = line.lower().find(attribute + '=', startpos)
        startpos = attribute_index+len(attribute)+1
        matchobject = urlmatch.search(line[startpos:])
        try:
            remoteurl = matchobject.group()
            lastdir, filename = matchobject.groups()
        except AttributeError, e:
            # It's probably embedded javascript or css, with no src
            # (matchobject is None when the regex found nothing)
            #print "Failed regex match on line: %s\n%s" %(line, e)
            raise ExceptionNoTagSrc
        # find local filename
        lastdir = strip_chars(lastdir)
        filename = strip_chars(filename)
        if not (filename or lastdir): filename = "blank"
        if not filename:
            filename = lastdir
        # Avoid overwriting a file already saved in destdir: first try
        # lastdir_filename, then append the first unused integer suffix.
        if path.exists(path.join(destdir, filename)):
            filename = lastdir + "_" + filename
            if path.exists(path.join(destdir, filename)):
                # NOTE(review): this local shadows the builtin enumerate()
                # for the rest of cleanlink.
                enumerate = 0
                while path.exists(path.join(destdir, filename + str(enumerate))):
                    enumerate += 1
                filename += str(enumerate)
        # make sure can download page, or there is no point in changing links
        if not grabPage(remoteurl, path.join(destdir, filename)):
            return line
        # substitute path in URL with local filename
        # (only the first match after startpos is replaced; the text before
        # startpos is re-attached untouched)
        newline = urlmatch.sub('"%s"'%filename, line[startpos:], count=1)
        newline = line[:startpos] + newline
        return newline
    # checklink: look for `tag` at or after lastindex and, if present, try to
    # clean its link. Returns (matched, index, line) where index is the larger
    # of biggestindex and the position at which the tag was found.
    def checklink(line, lastindex, biggestindex, tag, attribute):
        matched = False
        index = line.lower().find(tag, lastindex)
        if index >= 0:
            try:
                line = cleanlink(line, attribute, index)
            except ExceptionNoTagSrc:
                # tag present but no usable URL: treat as not found
                index = -1
            else:
                cleanAndFollow.cleaned = True
                matched = True
        if index > biggestindex: return matched, index, line
        else: return matched, biggestindex, line
    # check for more than one tag on the line
    # Each pass restarts the search just past the furthest tag position seen
    # so far, and the loop ends on the first pass in which no tag matched.
    lastindex = 0
    biggestindex = 0
    while True:
        imgmatch, biggestindex, line = checklink(line, lastindex, biggestindex, '<img ', 'src')
        linkmatch, biggestindex, line = checklink(line, lastindex, biggestindex, '<link ', 'href')
        scriptmatch, biggestindex, line = checklink(line, lastindex, biggestindex, '<script ', 'src')
        lastindex = biggestindex + 1
        if not (imgmatch or linkmatch or scriptmatch): break
    if cleanAndFollow.cleaned: return line
    else: return False


def opentag(line):
    """Tell whether line seems to stop in the middle of a tag.

    Returns True when the number of '<' characters differs from the number
    of '>' characters, False when they are equal.
    """
    opened = line.count('<')
    closed = line.count('>')
    return opened != closed


def snarfPage(response, destdir):
    """Save page with all elements to local directory.
       Rewrite internal links to point locally.

    The page is written to destdir/index.html; every element fetched by
    cleanAndFollow is stored next to it.

    Args:
        response: file-like page response supporting tell(), readlines()
            and seek(); its position is restored before returning.
        destdir: directory to save into; created (with parents) if missing.

    Raises:
        ExceptionDocumentEmpty: if the response contains no lines.
    """
    pos = response.tell()
    # Pull all lines out of page before we start snarfing, or our result
    # will go away with the first embedded link request.
    lines = response.readlines()
    # Check before touching the filesystem so an empty page does not leave
    # an empty directory behind.
    if not lines: raise ExceptionDocumentEmpty
    # makedirs also creates a missing parent (e.g. the tmp directory on a
    # first run); an already existing directory is reused, not an error.
    if not path.isdir(destdir):
        os.makedirs(destdir)
    # Process line by line, combining lines with a break in a tag
    maxlines = 5 # join at most this many following lines onto one
    cleaned_response = []
    lines.reverse() # so pop() hands the lines back in document order
    while lines:
        l = lines.pop()
        # check for lines that contain only part of a tag
        count = 0
        while opentag(l) and lines and count < maxlines:
            # readlines() keeps each line's own newline, so the pieces are
            # concatenated as-is; inserting another "\n" would add a blank
            # line to the saved page.
            l += lines.pop()
            count += 1
        # check if tag needs to be cleaned and followed
        cleaned = cleanAndFollow(l, destdir)
        if cleaned is False:
            cleaned_response.append(l)
        else:
            cleaned_response.append(cleaned)
    response.seek(pos) # Set file handle back in case caller needs it
    f = open(path.join(destdir, 'index.html'), 'w')
    try:
        f.writelines(cleaned_response)
    finally:
        f.close()


# Expect posting names to be blog_url/YYYY/MM/slug.ext?junk
# blog_url is escaped so that its dots (and any other regex metacharacters)
# match literally; the raw string keeps the regex backslashes as written.
nameregex = r'%s/(\d\d\d\d)/(\d\d)/([A-Za-z0-9%%\-_]*)[.\?]*\.*' % re.escape(blog_url)
namematch = re.compile(nameregex)
def backupPage(url):
    """Save the page with all elements as a zipped archive.

    The page is opened with the module-level browser `br`, assembled in a
    directory under tmp_dir by snarfPage, zipped, and the archive is moved
    to backup_dir (replacing any earlier archive of the same name where the
    platform's move allows it). The working directory under tmp_dir is
    removed afterwards.

    Args:
        url: address of the blog entry to back up.
    """
    # Use the URL's date and slug (or, failing that, the page title) as the
    # archive filename and tmp directory name.
    br.open(url)
    match = namematch.search(url)
    if match is None:
        # unexpected URL, just use title. title is a method and must be
        # called; strip_chars maps a None title to ''.
        filename = strip_chars(br.title())
    else:
        year, month, slug = match.groups()
        filename = strip_chars('%s%s_%s'%(year,month,slug))
    if not filename:
        # An empty name would make tmp_file_dir equal tmp_dir itself, and the
        # final rmtree would then wipe the whole tmp directory.
        filename = 'untitled'

    # Download all page elements
    if not path.isdir(tmp_dir):
        os.makedirs(tmp_dir) # first run: the scratch directory may not exist yet
    tmp_file_dir = path.join(tmp_dir, filename)
    snarfPage(br.response(), tmp_file_dir)

    # Zip up the directory into a backup archive. Member names are given
    # explicitly (relative to tmp_dir) instead of changing the process
    # working directory, which broke relative tmp_dir/backup_dir paths.
    archive_name = tmp_file_dir + '.zip'
    archive = zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED)
    try:
        for root, dirs, files in os.walk(tmp_file_dir):
            rel_root = root[len(tmp_dir)+1:] # drop "tmp_dir/" prefix, separator included
            for f in files:
                archive.write(path.join(root, f), path.join(rel_root, f))
    finally:
        # close() writes the zip's central directory; the archive must be
        # complete before it is moved.
        archive.close()
    shutil.move(archive_name, path.join(backup_dir, filename + '.zip'))
    # Now that we have an archive, clean up tmp
    shutil.rmtree(tmp_file_dir)


def getpagelist():
    '''Get list of all entries from archive_url, then filter it to only look at
    recent posts.

    A post is recent when the year/month in its URL is not earlier than the
    month containing (today - maxage weeks). Links whose URL digits do not
    form a valid year/month are skipped.

    Returns:
        List of absolute post URLs, duplicates removed, in page order.
    '''
    br.open(archive_url)

    # Determine the oldest date we want to download from, rounded down to
    # the first of its month because the URL only carries year and month.
    oldest_date = datetime.date.today() - datetime.timedelta(weeks=maxage)
    cutoff = oldest_date.replace(day=1)

    # Gotta pull all the links out before following any, because the link
    # generator will go away on the next page request
    pages = []
    for link in br.links(url_regex=nameregex):
        match = namematch.search(link.absolute_url)
        if match is None:
            continue # absolute form of the link does not look like a post
        year, month = match.groups()[:2]
        try:
            postdate = datetime.date(year=int(year), month=int(month), day=1)
        except ValueError:
            # Digits matched the pattern but are no real date (e.g. month
            # 00 or 13); such a link must not abort the whole run.
            continue
        if postdate >= cutoff:
            pages.append(link.absolute_url)

    return uniquelist(pages)


def main():
    """Log in to the blog, list the recent entries and archive each one."""
    # The download helpers look the browser up through the module-level
    # name `br`, so it has to be bound globally.
    global br
    br = setupBrowser()

    # First visit to the blog directs to the login page
    br.open(blog_url)
    br = login(br)

    # Collect every page to back up, then make the magic happen!
    for page_url in getpagelist():
        backupPage(page_url)


if __name__ == "__main__":
    main()


# vim:et sw=4 ts=4 sta

