## Last updated 8 Dec 2013
##
## This program takes data from a locally downloaded Twitter archive
## and outputs HTML, text, JSON, geo-coordinates in CSV, and best
## friends in CSV.
## See http://blog.twitter.com/2012/12/your-twitter-archive.html
##
## It can run either as a dedicated program or as a module.
##
## Please visit https://github.com/mshea/Parse-Twitter-Archive
## for more information.
##
## This work is licensed under the Creative Commons Attribution-
## NonCommercial-ShareAlike 3.0 License. You are free to share, copy,
## distribute, transmit, remix, and adapt the work as long as you
## attribute it to Michael E. Shea at http://mikeshea.net/, share the
## work under the same license, and do so for non-commercial purposes.
## To view a copy of this license, visit
## http://creativecommons.org/licenses/by-nc-sa/3.0/.

import glob
import json
import csv
import re
import sqlite3
from datetime import datetime, timedelta
from collections import Counter

params = {
    'data_files': './data/js/tweets/*.js',
    'geo_output': 'mshea_tweets_geo.csv',
    'text_output': 'mshea_tweets.txt',
    'json_output': 'mshea_tweets.json',
    'bff_output': 'mshea_bffs.csv',
    'csv_output': 'mshea_tweets.csv',
    'sqlite3_output': 'mshea_tweets.sqlite3',
    'html_output': 'mshea_tweets.html',
    'twitter_user_id': 'mshea',
}


def load_data(files):
    # Read every monthly tweet file and return all tweets sorted by id.
    items = []
    for file in glob.glob(files):
        with open(file) as f:
            # The first line of each archive file is a JavaScript
            # variable assignment, not valid JSON, so skip it.
            d = "".join(f.readlines()[1:])
            for tweet in json.loads(d):
                items.append(tweet)
    return sorted(items, key=lambda k: k['id'])


def get_bffs(d):
    # Count @mentions to find the 50 most-mentioned users.
    words = []
    for item in d:
        for word in item['text'].split():
            if '@' in word:
                words.append(word.replace(':', '').lower().encode('utf-8'))
    return Counter(words).most_common(50)


def get_bigrams(d):
    # Print the 100 most common adjacent word pairs across all tweets.
    words = []
    for item in d:
        words += re.findall(r'\w+', item['text'])
    output = Counter(zip(words, words[1:])).most_common(100)
    for item in output:
        print item


def get_csv_output(d):
    output = [('id', 'date', 'tweet')]
    for item in d:
        output.append((
            item['id_str'],
            item['created_at'],
            item['text'].encode('utf-8')
        ))
    return output


def get_geo(d):
    # Collect date, text, and coordinates for geotagged tweets;
    # tweets without coordinates are skipped.
    output = [('date', 'tweet', 'lat', 'long')]
    for item in d:
        try:
            lat = item['geo']['coordinates'][0]
            lon = item['geo']['coordinates'][1]
            date = item['created_at']
            text = item['text'].encode('utf-8')
            output.append((date, text, lat, lon))
        except (KeyError, TypeError):
            pass  # no coordinates
    return output


def link_https_in_text(text):
    # Wrap bare URLs in anchor tags so they are clickable in the
    # HTML output.
    return re.sub(r'http://[^ ,]*',
                  lambda t: '<a href="%s">%s</a>' % (t.group(0), t.group(0)),
                  text)


def write_html(tweets, output_file):
    html_output = ""
    for item in tweets:
        # Shift the UTC timestamp back five hours for display, then add
        # the offset back so the <time> element carries the true UTC time.
        d = datetime.strptime(item['created_at'],
                              '%Y-%m-%d %H:%M:%S +0000') - timedelta(hours=5)
        day_string = d.strftime('%d %b %Y %I:%M %p')
        true_time_object = d + timedelta(hours=5)
        time_element = true_time_object.isoformat("T")
        text = link_https_in_text(item['text'])
        tweet_link = ('http://twitter.com/%s/status/%s'
                      % (params['twitter_user_id'], item['id']))
        # The original HTML template is truncated in this copy; the
        # markup below is a minimal stand-in built from the same fields
        # the loop computes.
        html_output += ('<p>%s <a href="%s"><time datetime="%s">%s'
                        '</time></a></p>\n'
                        % (text, tweet_link, time_element, day_string))
    with open(output_file, 'w') as f:
        f.write(html_output)
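
# The header notes the script "can run either as a dedicated program or
# as a module," but the original entry point is not included in this
# excerpt. The driver below is only a sketch of that wiring: it loads
# the archive and exercises the helpers defined above. Writers for the
# other outputs named in params (text, JSON, sqlite3) are omitted here
# because their implementations are not shown.
if __name__ == '__main__':
    tweets = load_data(params['data_files'])
    write_html(tweets, params['html_output'])
    with open(params['bff_output'], 'wb') as f:
        csv.writer(f).writerows(get_bffs(tweets))
    with open(params['geo_output'], 'wb') as f:
        csv.writer(f).writerows(get_geo(tweets))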