summaryrefslogtreecommitdiff
path: root/app/blog/parse.py
blob: a1e30565e0c1ca7855b564f75438ddcf88fa539e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/python
import os
import datetime
from dateutil.parser import parse as dateparser
from os.path import abspath, dirname
from django.core.exceptions import ObjectDoesNotExist
from blog.models import Entry

from django.conf import settings


def parse_file(filepath):
    """Parse a post file into its metadata and markdown body.

    The first line of the file is skipped without being inspected
    (presumably the opening ``---`` of the header -- confirm against the
    post format).  The lines that follow are read as ``key: value`` pairs
    until a line consisting of ``---`` is reached; everything after that
    line is the body.

    Args:
        filepath: path of the file to read.

    Returns:
        A dict mapping each stripped metadata key to its stripped value,
        plus ``"body_markdown"``: the body lines, each stripped of
        surrounding whitespace and joined with newlines.  A non-empty body
        starts with a newline.

    Raises:
        ValueError: if a non-blank metadata line has no ``:`` separator, or
            if the closing ``---`` line is missing.
    """
    data = {}
    # The context manager closes the handle even if read() fails.
    with open(filepath) as handle:
        raw = handle.read().splitlines()

    body_start = None
    for index, line in enumerate(raw[1:], 1):
        stripped = line.strip()
        if stripped == '---':
            body_start = index + 1
            break
        if not stripped:
            # Blank lines inside the metadata header carry no information.
            continue
        key, sep, value = line.partition(':')
        if not sep:
            raise ValueError("malformed metadata line %d in %s: %r"
                             % (index + 1, filepath, line))
        data[key.strip()] = value.strip()

    if body_start is None:
        raise ValueError("no closing '---' delimiter found in %s" % filepath)

    # Take the body by line position rather than by splitting the text on
    # '---', so a '---' inside the post (e.g. a markdown horizontal rule) or
    # inside a metadata value no longer truncates or misplaces the body.
    # NOTE(review): stripping every body line also removes leading
    # indentation, which markdown uses for code blocks and nested lists;
    # kept as-is to preserve existing output -- confirm it is intended.
    body_lines = [line.strip() for line in raw[body_start:]]
    # The leading empty element reproduces the newline that follows the
    # delimiter, so a non-empty body still begins with "\n".
    data["body_markdown"] = "\n".join([''] + body_lines)
    return data

"""
    Now I need a function to query the db for the title and date.
    If there's no entry, then the post is new and we add it and publish.
    What about edits, though? Detecting edits means we need to check the
    file's last-modified time, and that's notoriously inaccurate.

    Example invocation:

from blog.parse import *
crawl_dir()

"""


def crawl_dir():
    """Report which post files changed since the previous crawl.

    Lists ``settings.POSTS_DIR``, ignoring names that start with
    ``README``, ``updategithub.php`` or ``.`` and names that end with ``~``.
    Every remaining file whose modification time is newer than that of the
    ``last_run`` marker file (stored next to this module) is parsed with
    ``parse_file`` and looked up in ``Entry`` by title and publication date;
    the outcome of each lookup is printed.  Finally the marker file is
    rewritten, which moves its modification time to the end of this run.

    If the marker file does not exist yet, every file is treated as changed.
    """
    file_root = settings.POSTS_DIR
    marker_path = os.path.join(abspath(dirname(__file__)), 'last_run')

    # The marker is not touched until the loop is done, so read its
    # timestamp once instead of once per file.
    try:
        last_run = datetime.datetime.fromtimestamp(
            os.path.getmtime(marker_path))
    except OSError:
        # No marker yet (first run): consider everything out of date.
        last_run = datetime.datetime.min

    ignored_prefixes = ('README', 'updategithub.php', '.')
    for name in os.listdir(file_root):
        if name.startswith(ignored_prefixes) or name.endswith('~'):
            continue
        fpath = os.path.join(file_root, name)
        last_mod = datetime.datetime.fromtimestamp(os.path.getmtime(fpath))
        if last_mod <= last_run:
            continue
        # Single-argument print calls behave the same as the print
        # statement on Python 2 and also parse on Python 3.
        print("needs an update")
        data = parse_file(fpath)
        date = dateparser(data['pub_date'])
        try:
            row = Entry.objects.get(title=str(data['title']), pub_date=date)
        except ObjectDoesNotExist:
            print(data['title'] + str(date) + " = not found")
        else:
            print("%s %s" % (row.title, date))

    # Only the marker's modification time is read back on the next run; the
    # timestamp written into it is informational.
    with open(marker_path, 'w') as last_mod_dump:
        print(last_mod_dump)
        last_mod_dump.write(str(datetime.datetime.now()) + "\n")