#!/usr/bin/env python
"""Grab URLs from the clipboard, interpret the queries as OpenID, and print.

In addition to URLs, I also scan for queries as they appear in httpd log files,
with a pattern like 'GET /foo?bar=baz HTTP'.

Requires the 'xsel' program to get the contents of the clipboard.
"""
from __future__ import unicode_literals

import re
import subprocess
import sys
from pprint import pformat

import six
from six.moves.urllib.parse import parse_qs, urlsplit, urlunsplit

from openid import message

# Keys that formatOpenIDMessage prints first (in this order) within each
# namespace section; any remaining keys follow in sorted order.
OPENID_SORT_ORDER = ['mode', 'identity', 'claimed_id']


class NoQuery(Exception):
    """Raised when a URL carries no (non-empty) query string.

    The offending URL is available as the ``url`` attribute.
    """

    def __init__(self, url):
        # Hand the URL to the base class so that ``args`` is populated;
        # copying and pickling rebuild the exception from ``args`` and would
        # otherwise fail for lack of the required ``url`` argument.
        super(NoQuery, self).__init__(url)
        self.url = url

    def __str__(self):
        return "No query in url %s" % (self.url,)


def getClipboard():
    """Return the contents of the clipboard as text.

    Runs ``xsel -o -b`` and reads its standard output.
    """
    xsel = subprocess.Popen(["xsel", "-o", "-b"], stdout=subprocess.PIPE)
    output = xsel.communicate()[0]
    # The pipe is binary, so communicate() hands back bytes.  The regular
    # expressions applied to this value are text patterns, which cannot be
    # matched against bytes on Python 3, so decode here at the I/O boundary.
    # NOTE(review): assumes the clipboard holds UTF-8; undecodable bytes are
    # replaced rather than raising.
    return output.decode('utf-8', 'replace')


def main():
    """Print every OpenID-looking query found in the clipboard.

    Queries are collected from POSTDATA lines, then from URLs, then from
    httpd log lines.  If at least one query was found, the formatted queries
    are printed; otherwise any "no query" errors gathered from the URLs are
    printed instead.
    """
    clipboard = getClipboard()
    candidate_urls = find_urls(clipboard)

    found = list(queriesFromPostdata(clipboard))
    failures = []
    for candidate in candidate_urls:
        try:
            found.append(queryFromURL(candidate))
        except NoQuery as exc:
            # Only reported when nothing at all could be printed.
            failures.append(exc)
    found += queriesFromLogs(clipboard)

    reports = ['at %s:\n%s' % (origin, openidFromQuery(params))
               for origin, params in found]

    if reports:
        print('\n\n'.join(reports))
        return

    for failure in failures:
        print(failure)


def queryFromURL(url):
    """Split a URL into its query-less form and its parsed query.

    Returns a ``(url_without_query, query)`` tuple, where ``query`` is the
    dict of value lists produced by ``parse_qs``.  The fragment is dropped
    along with the query.

    Raises NoQuery if the URL has no query parameters.
    """
    scheme, netloc, path, query_string, _fragment = urlsplit(url)
    params = parse_qs(query_string)

    if params:
        # Reassemble only scheme, host and path.
        base_url = urlunsplit((scheme, netloc, path, None, None))
        return (base_url, params)

    raise NoQuery(url)


def openidFromQuery(query):
    """Return a printable rendering of a parsed query.

    The query (a dict of value lists) is interpreted as an OpenID message and
    formatted with formatOpenIDMessage.  If that fails for any reason, the
    error text is written to stderr and the pretty-printed raw query is
    returned instead.
    """
    try:
        parsed = message.Message.fromPostArgs(unlistify(query))
        return formatOpenIDMessage(parsed)
    except Exception as exc:
        # XXX - side effect.
        sys.stderr.write(six.text_type(exc))
        return pformat(query)


def formatOpenIDMessage(msg):
    """Format an OpenID message as text, one section per namespace.

    ``msg`` must provide ``args`` (a mapping of ``(ns_uri, ns_key)`` to value)
    and ``namespaces.getAlias(ns_uri)``.  Each section starts with a
    ``alias <ns_uri>`` heading followed by indented ``key = value`` lines:
    first the keys named in OPENID_SORT_ORDER, then the rest sorted by key.
    Sections are separated by a blank line.
    """
    # Group the flat (namespace, key) -> value mapping by namespace URI.
    by_namespace = {}
    for (ns_uri, ns_key), value in msg.args.items():
        by_namespace.setdefault(ns_uri, {})[ns_key] = value

    sections = []
    for ns_uri, fields in by_namespace.items():
        alias = msg.namespaces.getAlias(ns_uri)
        if alias is message.NULL_NAMESPACE:
            alias = 'openid'
        lines = ["  %s <%s>" % (alias, ns_uri)]

        # Well-known keys come first; removing them leaves only the
        # remaining keys for the sorted pass below.
        for key in OPENID_SORT_ORDER:
            if key in fields:
                lines.append("    %s = %s" % (key, fields.pop(key)))

        for name, value in sorted(fields.items()):
            lines.append("    %s = %s" % (name, value))

        sections.append('\n'.join(lines))

    return '\n\n'.join(sections)


def unlistify(d):
    """Return a copy of ``d`` with each value replaced by its first element."""
    return {key: values[0] for key, values in d.items()}


def queriesFromLogs(s):
    """Find queries in httpd-log-style request lines.

    Scans ``s`` for text of the form ``GET /path?query HTTP`` and returns a
    list of ``(path, query)`` tuples, where ``query`` is the dict of value
    lists produced by ``parse_qs``.  ``path`` is None when the request target
    starts directly with ``?``.
    """
    # The path stops at the first '?' and the query at the first whitespace.
    # Greedy '.*' / '.+' would instead split at the *last* '?' on the line
    # and run on to the last ' HTTP', mangling queries that contain a raw
    # '?' or lines that mention ' HTTP' again further along.
    qre = re.compile(r'GET (/[^\s?]*)?\?(\S+) HTTP')

    return [(match.group(1), parse_qs(match.group(2)))
            for match in qre.finditer(s)]


def queriesFromPostdata(s):
    """Find queries in lines that start with ``POSTDATA=``.

    Returns a list of ``(where, query)`` tuples.  ``where`` is the value of a
    preceding ``Host=`` line when one is found, otherwise the literal string
    ``'POSTDATA'``; ``query`` is the dict of value lists produced by
    ``parse_qs``.
    """
    # This looks for query data in a line that starts POSTDATA=.
    # Tamperdata outputs such lines.  If there's a 'Host=' in that block,
    # use that too, but don't require it.
    #
    # DOTALL lets the lazy '.*?' skip over the lines between Host= and
    # POSTDATA=.  The query itself must stay on its own line, hence
    # '[^\n]*': a greedy '.*' under DOTALL would swallow everything up to
    # the end of the input, corrupting the query and hiding later blocks.
    qre = re.compile(
        r'(?:^Host=(?P<host>.+?)$.*?)?^POSTDATA=(?P<query>[^\n]*)$',
        re.DOTALL | re.MULTILINE)
    return [(match.group('host') or 'POSTDATA',
             parse_qs(match.group('query'))) for match in qre.finditer(s)]


def find_urls(s):
    """Return every URL-like substring of ``s``, in order of appearance.

    Three kinds of text are recognised: explicit ``http://`` / ``https://``
    URLs, bare host names ending in one of a few well-known TLDs, and
    ``mailto:`` addresses.  An enclosing ``<...>`` or ``<URL:...>`` wrapper
    is not part of the returned string.
    """
    # Regular expression borrowed from urlscan
    # by Daniel Burrows <dburrows@debian.org>, GPL.
    urlinternalpattern = r'[{}a-zA-Z/\-_0-9%?&.=:;+,#~]'
    # The final character is drawn from a smaller set, so punctuation such
    # as a sentence-ending '.' or ',' is not taken as part of the URL.
    urltrailingpattern = r'[{}a-zA-Z/\-_0-9%&=+#]'
    httpurlpattern = r'(?:https?://' + urlinternalpattern + r'*' + urltrailingpattern + r')'
    # Used to guess that blah.blah.blah.TLD is a URL.
    tlds = ['biz', 'com', 'edu', 'info', 'org']
    guessedurlpattern = r'(?:[a-zA-Z0-9_\-%]+(?:\.[a-zA-Z0-9_\-%]+)*\.(?:' + '|'.join(tlds) + r'))'
    # Raw string: '\-' is not a valid escape sequence in an ordinary string
    # literal and triggers a warning on current Python versions.
    mailtopattern = r'(?:mailto:[a-zA-Z0-9\-_]*@[0-9a-zA-Z_\-.]*[0-9a-zA-Z_\-])'
    urlre = re.compile(r'(?:<(?:URL:)?)?(' + httpurlpattern + r'|' + guessedurlpattern +
                       r'|' + mailtopattern + r')>?')

    return [match.group(1) for match in urlre.finditer(s)]


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
