#### NOTE: AFTER RUNNING, I CHANGED revision_editor_dumb to #### revision_editor_sane in query. Shouldn't matter much at all for #### results, but FYI, I did it. Could have a minor change. # Templates that matter: # # adoptee: indicates someone is actually an adoptee of someone (and who) # # adopter: indicates someone is actually an adopter of someone (and who) # # Find adoptee/adopter pairs; then for each, log by timestamp whenever # a communication between the two of them happened. A communication is # defined as the adoptee posting on an adopter's talk page, or # vice-versa. Note that it is possible communication between the # occurs by posting on their own pages, but I can't easily pull that # out. # # Technique: # 1. Find all adoptee / adopter pairs, either via adoptee or adopter # templates. # 2. Go to talk pages for each adoptee and adopter pair, looking to # see who posted first on the other's page. # # NOTE: YOU'RE NOT HANDLING CAPITALIZATION RIGHT, TAKE ANOTHER LOOK # ALSO LOOK AT THE RESULTS, SOMETHING IS STILL JUST NOT QUITE RIGHT; # SEEMS LIKE LOTS OF TALK PAGE POSTINGS ARE MISSING import cPickle import sys import time import collections import itertools import numpy import logging import psycopg2 import psycopg2.extensions input = open('/scratch/bento/music/results2010/userPageHistory.out','r') allhist = cPickle.load(input) input.close() # adoptee and adopter start looking to close to each other in code, # and it's going to be too easy to mix up. Renaming as parent and child. adoptionPair = set() childLonely = set() # Children who don't specify parent parentLonely = set() # Parents who don't specify children # Loop over all user pages count = 0 #allhist=allhist[1:5] # DEBUG for row in allhist: count += 1 print count, pageId = row[0] indivHist = row[1] # PageId is nice, but for purposes of this exercise, I need # username; that's because I'm going to be looking at talk pages, # and they have lots of subpages. logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') conn = psycopg2.connect('host=bento user=wikipedia dbname=wikipedia') #conn.set_client_encoding('UNICODE') cur = conn.cursor() #psycopg2.extensions.register_type(psycopg2.extensions.UNICODE, cur) cur.execute('''SELECT DISTINCT title FROM enwp_dump_20100130.page WHERE page_id='''+str(pageId)+''';''') # It is often the case that the pageId we have here (where the # template occurred) is a subpage, and hence we should pull out # the user name. # Should never have more than one match here. # # matches = 0 for r in cur: matches += 1 userName = r[0].split('/')[0] assert matches == 1, 'Too many matches for one id' for revision in indivHist: templates = revision[1] for template in templates: # Separate out the "adoptee" or "adopter" tag from the # actual person, if indicated. Remove the brace tags at # the ends, separate at the delimiter. info = template.strip('{}').split('|') assert len(info) >= 1, 'Template did not split up correctly' role = info[0] # Adopter or adoptee # adoptionPair is read as (child,parent) if role.lower() == 'adoptee': # I originally had tests in here to make sure that # template didn't have too many arguments, but a # handful of adoptees got it wrong. So I'm leaving the # test out. if len(info) == 1: childLonely.add(userName) else: adoptionPair.add((userName,info[1].encode('utf-8'))) # I'm choosing to leave this code out. If an adopter # indicates having an adoptee, but the adoptee doesn't do # it, then the adoptee clearly hasn't really bought # in. Plus, I've got junk data from adopters who pick # nonsense adoptees, like Jimbo Wales. # elif role.lower() == 'adopter': # assert len(info) <= 3, 'Template has too many arguments' # if len(info) == 1: # parentLonely.add(userName) # elif len(info) == 2 and info[1] == '': # # We have a couple of adopters who put a delimiter # # in as if they are going to specify an adoptee, # # but leave it blank # parentLonely.add(userName) # else: # adoptionPair.add((info[1].encode('utf-8'),userName)) # Now that I've got the pairs, compare each talk page to find an entry # by the other. # Turn adoptionPair into a sorted list so can get predictable results adoptionPair = list(adoptionPair) adoptionPair.sort() # Find all time stamps when the adoptee posted to the adopter page, or # the other way around outcur = conn.cursor() outcur.execute('''DROP TABLE IF EXISTS music.e01communication;''') outcur.execute('''CREATE TABLE music.e01communication (adoptee text, adopter text, written_by text,comm_time timestamp with time zone);''') outQueryTempl = '''INSERT INTO music.e01communication VALUES (%s,%s,%s, (SELECT (TIMESTAMP WITH TIME ZONE 'epoch' + %s * INTERVAL '1 second')));''' count = 0 for pair in adoptionPair: count += 1 print count,pair queryTempl = """SELECT R.timestamp FROM enwp_dump_20100130.revision R, enwp_dump_20100130.revision_editor_sane RE, enwp_dump_20100130.page P WHERE RE.revision_id = R.revision_id AND R.page_id = P.page_id AND RE.username=%s AND P.namespace=3 AND (P.title=%s OR P.title LIKE %s);""" data1 = pair[0],pair[1],pair[1]+"/%" cur = conn.cursor() cur.execute(queryTempl,data1) for r in cur: timestamp = r[0] outcur.execute(outQueryTempl,(pair[0],pair[1],"adoptee",timestamp)) data2 = pair[1],pair[0],pair[0]+"/%" cur = conn.cursor() cur.execute(queryTempl,data2) for r in cur: timestamp = r[0] outcur.execute(outQueryTempl,(pair[0],pair[1],"adopter",timestamp)) cur.execute('''CREATE INDEX e01communication_adoptee_idx on music.e01communication (adoptee);''') conn.commit() cur.close() conn.close() print 'Done'