# app/books/kindleparser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

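"""
Import Kindle highlights into the Django database.

This script reads a Kindle "My Clippings.txt" file, in which individual
clippings are separated by lines of "==========", and tries to load each
highlight into the Django database.

klip (https://www.npmjs.org/package/klip) can convert My Clippings.txt to
JSON, but the parser below works on the raw text file, not on klip's JSON
output.
"""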
import json
import datetime
from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
from books.models import Book, BookHighlight


class KindleClippingsParser(object):
    """Load the highlights of a Kindle "My Clippings.txt" file into the database.

    Each clipping in the file is expected to look like this, followed by a
    separator line of ten "=" characters::

        Title (Author)
        - Your Highlight on page 12 | Location 100-101 | Added on Monday, November 23, 2020 11:22:41 PM
        <blank line>
        Text of the highlight

    ``parse`` walks the file and hands every usable clipping to
    ``create_highlight``, which stores it through the ``Book`` and
    ``BookHighlight`` models.
    """

    # Text that separates individual clippings inside the file.
    _RECORD_SEPARATOR = "=========="

    # Known beginnings of the metadata line; the text between one of these
    # and the next " |" is stored as the page of the highlight. They are
    # tried in this order.
    _POSITION_PREFIXES = (
        "- Your Highlight on page ",
        "- Your Highlight on Location ",
        "- Your Note on page ",
        "- Your Note on Location ",
    )

    # Timestamps look like "November 23, 2020 11:22:41 PM". The hour must be
    # parsed with %I: strptime ignores %p when the hour is parsed with %H,
    # which would store every PM clipping twelve hours too early.
    _DATE_FORMAT = '%B %d, %Y %I:%M:%S %p'

    def __init__(self, path):
        """Remember the location of the clippings file.

        Args:
            path: Path of the "My Clippings.txt" file to import.
        """
        self.path = path

    def parse(self):
        """Read the clippings file and store every highlight that has a body.

        Clippings without body text (fewer than four lines, or an empty
        fourth line) are skipped.

        Raises:
            IndexError: If a metadata line has no "| Added on " timestamp
                in the expected shape.
            ValueError: If a timestamp does not match the expected
                "November 23, 2020 11:22:41 PM" layout.
        """
        # Kindle writes the file as UTF-8; decoding it explicitly keeps the
        # byte-order-mark handling below independent of the system locale.
        with open(self.path, 'r', encoding='utf-8') as f:
            contents = f.read()
        for raw in contents.split(self._RECORD_SEPARATOR):
            parsed = self._parse_clipping(raw)
            if parsed is None:
                continue
            title, date_time_obj, page, body = parsed
            self.create_highlight(title, date_time_obj, page, body)

    @staticmethod
    def _parse_clipping(raw):
        """Split one raw clipping into ``(title, date, page, body)``.

        Returns ``None`` when the clipping has no body text.
        """
        # Every record but the first starts with the line break that ended
        # the preceding separator line. Drop only that break, so the title
        # of the very first record in the file is not thrown away.
        if raw.startswith("\n"):
            raw = raw[1:]
        lines = raw.split("\n")
        # Title, metadata, blank line, body: the body is the fourth line.
        if len(lines) < 4 or lines[3] == "":
            return None

        # A title line can start with a byte order mark.
        header = lines[0].lstrip("\ufeff")
        # The title is everything before the first "(". The trailing space
        # before "(" is kept on purpose: the title is used as a lookup key
        # by create_highlight. Without any "(" the whole line is the title.
        # The author inside the parentheses is not used.
        title = header.split("(")[0]

        meta = lines[1]
        # Stays None when the metadata line starts with none of the known
        # prefixes.
        page = None
        for prefix in KindleClippingsParser._POSITION_PREFIXES:
            if prefix in meta:
                page = meta.split(prefix)[1].split(" |")[0]
                break

        # "| Added on Monday, November 23, 2020 11:22:41 PM": drop the
        # weekday before the first comma and parse the remainder.
        stamp = meta.split("| Added on ")[1].split(",", 1)[1]
        date_time_obj = datetime.datetime.strptime(
            stamp.strip(), KindleClippingsParser._DATE_FORMAT)
        return title, date_time_obj, page, lines[3]

    def create_highlight(self, title, date, page, clip):
        """Store one highlight, creating its book when none can be found.

        The book is looked up by its ``kindle_title`` first, then by a
        case-insensitive match of the first three words of the title
        against ``Book.title``; when that finds nothing, or more than one
        book, a book is fetched or created from the Kindle title.

        Args:
            title: Title of the book as it appears in the clippings file.
            date: ``datetime`` at which the highlight was added.
            page: Page or location text of the highlight; may be ``None``.
            clip: Text of the highlight.
        """
        print(title)
        try:
            book = Book.objects.get(kindle_title=title)
            print("success")
        except ObjectDoesNotExist:
            try:
                search_title = " ".join(title.split(" ")[:3])
                book = Book.objects.get(title__icontains=search_title)
                print(book)
            except (ObjectDoesNotExist, MultipleObjectsReturned):
                book, created = Book.objects.get_or_create(
                    kindle_title=title,
                    title=title,
                    read_date=date,
                    body_markdown='tk',
                )
                if created:
                    print(book)

        # See if we already have this highlight. exists() is used instead
        # of get() so that several matching rows count as "already saved"
        # rather than raising MultipleObjectsReturned.
        # NOTE(review): this matches on the Kindle title, not on the book
        # resolved above, so it finds nothing when book.title differs from
        # the Kindle title - confirm whether book=book was intended.
        already_saved = BookHighlight.objects.filter(
            book__title=title,
            date_highlighted=date,
        ).exists()
        if already_saved:
            return

        # If we don't have it yet, create a new book highlight.
        print("hightlight: %s" % book.title)
        print("page : %s" % page)
        print("on date: %s" % date)
        print("quote: %s" % clip)
        print("--------------")
        BookHighlight.objects.get_or_create(
            book=book,
            page=page,
            date_highlighted=date,
            body_markdown=clip
        )