Diffstat (limited to 'bin/tweet_archive_tools.py')
-rwxr-xr-x  bin/tweet_archive_tools.py  191
1 file changed, 191 insertions, 0 deletions
diff --git a/bin/tweet_archive_tools.py b/bin/tweet_archive_tools.py
new file mode 100755
index 0000000..2737250
--- /dev/null
+++ b/bin/tweet_archive_tools.py
@@ -0,0 +1,191 @@
## Last updated 8 Dec 2013
##
## This program takes data from a locally downloaded Twitter archive
## and outputs HTML, text, JSON, geo-coordinates in CSV, and best
## friends in CSV.
## See http://blog.twitter.com/2012/12/your-twitter-archive.html
##
## It can run either as a dedicated program or as a module.
##
## Please visit https://github.com/mshea/Parse-Twitter-Archive
## for more information.
##
## This work is licensed under the Creative Commons Attribution
## NonCommercial-ShareAlike 3.0 License. You are free to share, copy,
## distribute, transmit, remix, and adapt the work as long as you
## attribute it to Michael E. Shea at http://mikeshea.net/, share the
## work under the same license, and do so for non-commercial purposes.
## To view a copy of this license, visit
## http://creativecommons.org/licenses/by-nc-sa/3.0/.
##

import glob
import json
import csv
import collections
import re
import sqlite3
from datetime import datetime, timedelta
from collections import Counter

params = {
    'data_files': './data/js/tweets/*.js',
    'geo_output': 'mshea_tweets_geo.csv',
    'text_output': 'mshea_tweets.txt',
    'json_output': 'mshea_tweets.json',
    'bff_output': 'mshea_bffs.csv',
    'csv_output': 'mshea_tweets.csv',
    'sqlite3_output': 'mshea_tweets.sqlite3',
    'html_output': 'mshea_tweets.html',
    'twitter_user_id': 'mshea',
}


def load_data(files):
    items = []
    files = glob.glob(files)
    for file in files:
        with open(file) as f:
            # Skip the first line: each archive file is a JavaScript
            # assignment, so only the rest of the file is valid JSON.
            d = "".join(f.readlines()[1:])
            j = json.loads(d)
            for tweet in j:
                items.append(tweet)
    return sorted(items, key=lambda k: k['id'])


def get_bffs(d):
    # Count @mentions across all tweets; the 50 most frequent
    # are the "best friends".
    words = []
    for item in d:
        for word in item['text'].split():
            if '@' in word:
                words.append(word.replace(':', '').lower().encode('utf-8'))
    return collections.Counter(words).most_common(50)


def get_bigrams(d):
    # Print the 100 most common word pairs across all tweets.
    words = []
    for item in d:
        words += re.findall(r'\w+', item['text'])
    output = Counter(zip(words, words[1:])).most_common(100)
    for item in output:
        print item


def get_csv_output(d):
    output = [('id', 'date', 'tweet')]
    for item in d:
        output.append((
            item['id_str'],
            item['created_at'],
            item['text'].encode('utf-8')
        ))
    return output


def get_geo(d):
    output = [('date', 'tweet', 'lat', 'long')]
    for item in d:
        try:
            lat = item['geo']['coordinates'][0]
            lon = item['geo']['coordinates'][1]
            date = item['created_at']
            text = item['text'].encode('utf-8')
            output.append((date, text, lat, lon))
        except (KeyError, TypeError):
            pass  # tweet has no geo coordinates
    return output


def link_https_in_text(text):
    # Wrap each http(s) URL in an anchor tag.
    parsed_text = re.sub(r'https?://[^ ,]*',
                         lambda t: "<a href='%s'>%s</a>" %
                         (t.group(0), t.group(0)), text)
    return parsed_text


def write_html(tweets, output_file):
    html_output = ""
    for item in tweets:
        # Shift UTC to US Eastern (UTC-5) for the human-readable string.
        d = datetime.strptime(item['created_at'],
                              '%Y-%m-%d %H:%M:%S +0000') \
            - timedelta(hours=5)
        day_string = d.strftime('%d %b %Y %I:%M %p')
        # The <time> element keeps the original UTC timestamp.
        true_time_object = d + timedelta(hours=5)
        time_element = true_time_object.isoformat("T")
        text = link_https_in_text(item['text'])
        tweet_link = 'http://twitter.com/%s/status/%s'\
            % (params['twitter_user_id'], item['id'])
        html_output += '<li id="%s">%s - <a href="%s">'\
                       '<time datetime="%s">%s</time></a></li>\n' \
                       % (item['id'],
                          text,
                          tweet_link,
                          time_element,
                          day_string)
    with open(output_file, "w") as f:
        f.write('<!DOCTYPE html>\n'
                '<title>Twitter Archive Output</title>\n'
                '<ul>\n')
        f.write(html_output.encode('utf-8'))
        f.write('</ul>')


def write_sqlite3(json_input, output_file):
    conn = sqlite3.connect(output_file)
    c = conn.cursor()
    try:
        c.execute('select count(*) from tweets')
    except sqlite3.OperationalError:
        # First run: the tweets table does not exist yet.
        c.execute('CREATE TABLE tweets'
                  '(id int not null primary key, '
                  'created_at text, text text)')
        conn.commit()
    data_to_write = []
    for item in json_input:
        data_to_write.append((int(item['id_str']),
                              item['created_at'],
                              item['text']))
    c.executemany('INSERT OR REPLACE '
                  'INTO tweets VALUES (?,?,?);',
                  data_to_write)
    conn.commit()


def write_text(tweets, output_file):
    text_output = ''
    for item in tweets:
        text_output += '%s\n%s\n%s\n\n' % (item['id'],
                                           item['created_at'],
                                           item['text'])
    with open(output_file, "w") as f:
        f.write(text_output.encode('utf-8'))


def write_csv(d, csv_file):
    with open(csv_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(d)


def write_json(json_data, output_file):
    with open(output_file, 'w') as f:
        f.write(json.dumps(json_data, indent=4))


def main():
    d = load_data(params['data_files'])
    # get_bigrams(d)  # optional: print the most common word pairs
    write_csv(get_bffs(d), params['bff_output'])
    write_csv(get_geo(d), params['geo_output'])
    write_csv(get_csv_output(d), params['csv_output'])
    write_html(d, params['html_output'])
    write_text(d, params['text_output'])
    write_json(d, params['json_output'])
    write_sqlite3(d, params['sqlite3_output'])


if __name__ == "__main__":
    main()
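As the header comment notes, the script can also be driven as a module. A minimal sketch of that usage under Python 2, assuming the file is importable as tweet_archive_tools (e.g. bin/ is on sys.path) and the archive uses the data/js/tweets layout above; the output filenames here are hypothetical, not part of the script:

    import tweet_archive_tools as tat

    # Load every tweet from the archive's JS files, oldest first.
    tweets = tat.load_data(tat.params['data_files'])

    # Write the full archive as JSON and the top 50 @mentions as CSV.
    tat.write_json(tweets, 'example_tweets.json')
    tat.write_csv(tat.get_bffs(tweets), 'example_bffs.csv')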