#### NOTE: AFTER RUNNING, I CHANGED revision_editor_dumb to
#### revision_editor_sane in query. Shouldn't matter much at all for
#### results, but FYI, I did it. Could have a minor change.
# Templates that matter:
#
# adoptee: indicates someone is actually an adoptee of someone (and who)
#
# adopter: indicates someone is actually an adopter of someone (and who)
#
# Find adoptee/adopter pairs; then for each, log by timestamp whenever
# a communication between the two of them happened. A communication is
# defined as the adoptee posting on an adopter's talk page, or
# vice-versa. Note that it is possible communication between the
# occurs by posting on their own pages, but I can't easily pull that
# out.
#
# Technique:
# 1. Find all adoptee / adopter pairs, either via adoptee or adopter
# templates.
# 2. Go to talk pages for each adoptee and adopter pair, looking to
# see who posted first on the other's page.
#
# NOTE: YOU'RE NOT HANDLING CAPITALIZATION RIGHT, TAKE ANOTHER LOOK
# ALSO LOOK AT THE RESULTS, SOMETHING IS STILL JUST NOT QUITE RIGHT;
# SEEMS LIKE LOTS OF TALK PAGE POSTINGS ARE MISSING

import cPickle
import sys
import time
import collections
import itertools
import numpy
import logging
import psycopg2
import psycopg2.extensions

input = open('/scratch/bento/music/results2010/userPageHistory.out','r')
allhist = cPickle.load(input)
input.close()

# adoptee and adopter start looking to close to each other in code,
# and it's going to be too easy to mix up. Renaming as parent and child.

adoptionPair = set()
childLonely = set()   # Children who don't specify parent
parentLonely = set()  # Parents who don't specify children

# Loop over all user pages
count = 0

#allhist=allhist[1:5]   # DEBUG

for row in allhist:
    count += 1
    print count,

    pageId = row[0]
    indivHist = row[1]

    # PageId is nice, but for purposes of this exercise, I need
    # username; that's because I'm going to be looking at talk pages,
    # and they have lots of subpages.

    
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    conn = psycopg2.connect('host=bento user=wikipedia dbname=wikipedia')
    #conn.set_client_encoding('UNICODE')
    cur = conn.cursor()

    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur)


    cur.execute('''SELECT DISTINCT title
                   FROM enwp_dump_20100130.page
                   WHERE page_id='''+str(pageId)+''';''')

    # It is often the case that the pageId we have here (where the
    # template occurred) is a subpage, and hence we should pull out
    # the user name.
    # Should never have more than one match here.
    # 
    #
    matches = 0
    for r in cur:
        matches += 1
        userName =  r[0].split('/')[0]
        assert matches == 1, 'Too many matches for one id'


    for revision in indivHist:

        templates = revision[1]

        for template in templates:


            # Separate out the "adoptee" or "adopter" tag from the
            # actual person, if indicated. Remove the brace tags at
            # the ends, separate at the delimiter.
            info = template.strip('{}').split('|')

            assert len(info) >= 1, 'Template did not split up correctly'

            role = info[0]  # Adopter or adoptee


            # adoptionPair is read as (child,parent)
            if role.lower() == 'adoptee':

                # I originally had tests in here to make sure that
                # template didn't have too many arguments, but a
                # handful of adoptees got it wrong. So I'm leaving the
                # test out.

                if len(info) == 1:
                    childLonely.add(userName)
                else:
                    adoptionPair.add((userName,info[1].encode('utf-8')))

            # I'm choosing to leave this code out. If an adopter
            # indicates having an adoptee, but the adoptee doesn't do
            # it, then the adoptee clearly hasn't really bought
            # in. Plus, I've got junk data from adopters who pick
            # nonsense adoptees, like Jimbo Wales.
            # elif role.lower() == 'adopter':
            #     assert len(info) <= 3, 'Template has too many arguments'
            #     if len(info) == 1:
            #         parentLonely.add(userName)
            #     elif len(info) == 2 and info[1] == '':
            #         # We have a couple of adopters who put a delimiter
            #         # in as if they are going to specify an adoptee,
            #         # but leave it blank
            #         parentLonely.add(userName)
            #     else:
            #         adoptionPair.add((info[1].encode('utf-8'),userName))
                

# Now that I've got the pairs, compare each talk page to find an entry
# by the other.


# Turn adoptionPair into a sorted list so can get predictable results
adoptionPair = list(adoptionPair)
adoptionPair.sort()


# Find all time stamps when the adoptee posted to the adopter page, or
# the other way around


outcur = conn.cursor()

outcur.execute('''DROP TABLE IF EXISTS music.e01communication;''')
outcur.execute('''CREATE TABLE music.e01communication (adoptee text, adopter text, written_by text,comm_time timestamp with time zone);''')
outQueryTempl = '''INSERT INTO music.e01communication VALUES (%s,%s,%s,
                (SELECT (TIMESTAMP WITH TIME ZONE 'epoch' + %s * INTERVAL '1 second')));'''


count = 0
for pair in adoptionPair:

    count += 1
    print count,pair


    queryTempl = """SELECT R.timestamp
                    FROM enwp_dump_20100130.revision R,
                       enwp_dump_20100130.revision_editor_sane RE,
                       enwp_dump_20100130.page P
                    WHERE RE.revision_id = R.revision_id AND 
                       R.page_id = P.page_id AND
                       RE.username=%s AND P.namespace=3 AND
                       (P.title=%s OR
                        P.title LIKE %s);"""


    data1 = pair[0],pair[1],pair[1]+"/%"
    cur = conn.cursor()
    cur.execute(queryTempl,data1)

    for r in cur:
        timestamp = r[0]
        outcur.execute(outQueryTempl,(pair[0],pair[1],"adoptee",timestamp))

    data2 = pair[1],pair[0],pair[0]+"/%"
    cur = conn.cursor()
    cur.execute(queryTempl,data2)

    for r in cur:
        timestamp =  r[0]
        outcur.execute(outQueryTempl,(pair[0],pair[1],"adopter",timestamp))


cur.execute('''CREATE INDEX e01communication_adoptee_idx on music.e01communication (adoptee);''')

conn.commit()
cur.close()
conn.close()

print 'Done'