1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
"""
kindle My Clippings.txt parsed to json by:
klip: https://www.npmjs.org/package/klip
This script then parses that json and tries to load it into the django database
"""
import json
import datetime
from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
from books.models import Book, BookHighlight
class KindleClippingsParser(object):
    """Import highlights from a Kindle ``My Clippings.txt`` file.

    ``parse()`` reads the file at ``path``, splits it into individual
    clippings and passes every clipping that has a body to
    ``create_highlight()``, which stores it as a ``BookHighlight`` attached
    to a ``Book``.
    """

    # Individual clippings within the file are separated by this line.
    ENTRY_SEPARATOR = "=========="
    # Prefixes of the metadata line that this importer understands. The text
    # between the prefix and the next " |" is used as the highlight's page
    # (for location-only highlights that is the location range).
    POSITION_MARKERS = (
        "- Your Highlight on page ",
        "- Your Highlight on Location ",
        "- Your Note on page ",
    )
    # Text that precedes the timestamp on the metadata line.
    DATE_MARKER = "| Added on "
    # Timestamps look like "November 23, 2020 11:22:41 PM" once the weekday
    # is removed. %I (12-hour clock) is required for %p to take effect; with
    # %H the AM/PM marker is parsed but ignored.
    DATE_FORMAT = "%B %d, %Y %I:%M:%S %p"

    def __init__(self, path):
        """Remember the path of the clippings file to import."""
        self.path = path

    def parse(self):
        """Read the clippings file and store every highlight found in it.

        Each clipping is expected to consist of a title line, a metadata
        line, a blank line and the body. Clippings without a body, or whose
        metadata line is not understood, are skipped.

        Raises:
            ValueError: if a timestamp does not match ``DATE_FORMAT``.
        """
        with open(self.path, 'r', encoding='utf-8') as f:
            entries = f.read().split(self.ENTRY_SEPARATOR)

        for entry in entries:
            # Every clipping except the first starts with the newline that
            # followed the previous separator. Strip it instead of blindly
            # dropping the first line, which would eat the first title.
            lines = entry.lstrip("\n").split("\n")

            # Don't try to write if we have no body (title, metadata, blank
            # line and body make four lines).
            if len(lines) < 4 or lines[3] == "":
                continue

            # A byte-order mark may precede the title.
            heading = lines[0].lstrip("\ufeff")
            # "Title (Author)" -> "Title ". A heading without parentheses is
            # used as the title unchanged. Surrounding whitespace is kept
            # on purpose so the value is not altered before the
            # ``kindle_title`` lookup.
            title = heading.split("(")[0]

            meta = lines[1]
            page = self._extract_page(meta)
            date_time_obj = self._extract_date(meta)
            if page is None or date_time_obj is None:
                print("skipping unrecognised clipping: %s" % meta)
                continue

            self.create_highlight(title, date_time_obj, page, lines[3])

    def _extract_page(self, meta):
        """Return the page/location text from a metadata line, or None."""
        for marker in self.POSITION_MARKERS:
            _, found, rest = meta.partition(marker)
            if found:
                return rest.split(" |")[0]
        return None

    def _extract_date(self, meta):
        """Return the "Added on" timestamp of a metadata line, or None.

        None is returned when the line has no "| Added on " part; a
        timestamp that is present but malformed raises ``ValueError``.
        """
        _, found, added_on = meta.partition(self.DATE_MARKER)
        if not found:
            return None
        # Drop the leading weekday ("Monday, ...").
        _, _, stamp = added_on.partition(",")
        return datetime.datetime.strptime(stamp.strip(), self.DATE_FORMAT)

    def create_highlight(self, title, date, page, clip):
        """Store one highlight unless the book already has one at ``date``.

        Args:
            title: book title as it appears in the clippings file.
            date: ``datetime`` at which the highlight was added.
            page: page (or location) text taken from the metadata line.
            clip: the highlighted text.
        """
        print(title)
        book = self._find_or_create_book(title, date)

        # See if we already have this highlight. The check is made against
        # the resolved book (its ``title`` may differ from the Kindle title)
        # and uses exists() so that several matches cannot raise.
        already_stored = BookHighlight.objects.filter(
            book=book,
            date_highlighted=date,
        ).exists()
        if already_stored:
            return

        # If we don't, create a new book highlight.
        print("hightlight: %s" % book.title)
        print("page : %s" % page)
        print("on date: %s" % date)
        print("quote: %s" % clip)
        print("--------------")
        BookHighlight.objects.get_or_create(
            book=book,
            page=page,
            date_highlighted=date,
            body_markdown=clip
        )

    def _find_or_create_book(self, title, date):
        """Return the Book for a Kindle title, creating it when necessary.

        Lookup order: exact ``kindle_title`` match, then a case-insensitive
        match of the first three words against ``title``, then
        ``get_or_create`` with ``date`` as the read date.
        """
        try:
            book = Book.objects.get(kindle_title=title)
            print("success")
            return book
        except ObjectDoesNotExist:
            pass

        try:
            search_title = " ".join(title.split(" ")[:3])
            book = Book.objects.get(title__icontains=search_title)
            print(book)
            return book
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            pass

        book, created = Book.objects.get_or_create(
            kindle_title=title,
            title=title,
            read_date=date,
            body_markdown='tk',
        )
        if created:
            print(book)
        return book
|