diff options
Diffstat (limited to 'app/books/kindleparser.py')
-rw-r--r-- | app/books/kindleparser.py | 114 |
1 files changed, 78 insertions, 36 deletions
import json
import datetime


class KindleClippingsParser(object):
    """Read a Kindle clippings text export and register the books it mentions.

    The export is a sequence of entries separated by a ``==========`` line.
    Each entry is expected to look like::

        Title (Author)
        - Your Highlight on page 5 | Added on Monday, November 23, 2020 11:22:41 PM
        <blank line>
        highlighted text

    ``parse()`` walks the entries in file order and calls
    ``create_highlight()`` whenever the title differs from the title of the
    previous entry.
    """

    ENTRY_SEPARATOR = "=========="
    # 12-hour clock: "%p" only influences the parsed hour when paired with
    # "%I".  With "%H" a time such as "11:22:41 PM" is read as 11:22:41.
    DATE_FORMAT = "%B %d, %Y %I:%M:%S %p"

    def __init__(self, path):
        """Remember the path of the clippings file; nothing is read yet."""
        self.path = path

    def parse(self):
        """Call ``create_highlight()`` once per run of same-titled entries.

        Entries are visited in file order.  Only a change of title triggers a
        call, so a book whose entries are interleaved with another book's is
        passed to ``create_highlight()`` more than once.
        """
        old_title = ''
        for entry in self.entries():
            if old_title != entry["title"]:
                self.create_highlight(entry["title"], entry["date"])
                old_title = entry["title"]

    def entries(self):
        """Yield one dict per clipping that has body text.

        Each dict has the keys ``title``, ``author``, ``page``, ``location``,
        ``date`` (a naive ``datetime.datetime``) and ``body``.  Entries with an
        empty body are skipped silently; entries whose metadata line carries no
        parseable date are skipped with a printed notice so that one bad entry
        does not abort the whole import.
        """
        # The code strips "\ufeff" from title lines, which only shows up when
        # the file is decoded as UTF-8, so do not depend on the locale default.
        with open(self.path, 'r', encoding="utf-8") as f:
            chunks = f.read().split(self.ENTRY_SEPARATOR)
        for chunk in chunks:
            try:
                entry = self._parse_entry(chunk)
            except ValueError as err:
                print("Skipping malformed clipping: %s" % err)
                continue
            if entry is not None:
                yield entry

    @classmethod
    def _parse_entry(cls, raw):
        """Turn one separator-delimited chunk into a dict, or None if bodiless.

        Raises:
            ValueError: if the metadata line has no "| Added on " section, no
                comma after it, or a date that does not match DATE_FORMAT.
        """
        # Every chunk except the first begins with the newline that ended the
        # separator line.  Strip that newline rather than dropping element 0,
        # which would throw away the title line of the file's first entry.
        lines = raw.lstrip("\n").split("\n")
        # Title, metadata, blank line, body: four lines are needed before
        # lines[3] can be inspected at all.
        if len(lines) < 4 or not lines[3].strip():
            return None

        header = lines[0].lstrip("\ufeff")
        # The title is everything before the first "(" and is left untrimmed
        # on purpose: it is the value matched against Book.kindle_title, so
        # normalising it here could stop matching rows saved earlier.
        title, paren, rest = header.partition("(")
        author = rest.split(")")[0] if paren else ""

        meta = lines[1]
        page = (cls._field_after(meta, "- Your Highlight on page ")
                or cls._field_after(meta, "- Your Note on page "))
        location = None
        if page is None:
            location = (
                cls._field_after(meta, "- Your Highlight on Location ")
                or cls._field_after(meta, "- Your Note on Location ")
            )

        _, marker, stamp = meta.partition("| Added on ")
        # Drop whatever precedes the first comma (the weekday in the sample
        # format), leaving e.g. "November 23, 2020 11:22:41 PM".
        _, comma, stamp = stamp.partition(",")
        if not (marker and comma):
            raise ValueError("no 'Added on' date in %r" % meta)
        added = datetime.datetime.strptime(stamp.strip(), cls.DATE_FORMAT)

        return {
            "title": title,
            "author": author,
            "page": page,
            "location": location,
            "date": added,
            "body": "\n".join(lines[3:]).strip(),
        }

    @staticmethod
    def _field_after(text, marker):
        """Return the text between ``marker`` and the next " |", else None."""
        _, found, tail = text.partition(marker)
        if not found:
            return None
        return tail.split(" |")[0]

    def create_highlight(self, title, date):
        """Find the Book for a Kindle title, creating it when none matches.

        Lookup order:
          1. exact match on ``kindle_title``;
          2. case-insensitive containment of the first three space-separated
             words of ``title`` in ``Book.title``;
          3. ``get_or_create`` with ``body_markdown='tk'`` as a placeholder
             and ``date`` as ``read_date``.

        Despite the name, no BookHighlight row is written yet.

        Returns:
            The matched or newly created Book.
        """
        # Imported here rather than at module level so the text-parsing
        # methods above can be used without a configured Django project.
        from django.core.exceptions import (
            MultipleObjectsReturned,
            ObjectDoesNotExist,
        )
        from books.models import Book

        print(title)
        try:
            book = Book.objects.get(kindle_title=title)
            print("success")
        except ObjectDoesNotExist:
            try:
                search_title = " ".join(title.split(" ")[:3])
                book = Book.objects.get(title__icontains=search_title)
                print(book)
            except (ObjectDoesNotExist, MultipleObjectsReturned):
                # No single fuzzy match: fall back to a dedicated row.
                book, created = Book.objects.get_or_create(
                    kindle_title=title,
                    title=title,
                    read_date=date,
                    body_markdown='tk',
                )
                if created:
                    print(book)
        # TODO: store the individual highlights (BookHighlight rows) using the
        # page/location/body values that entries() already extracts.
        return book