#!/usr/bin/env python

# Python script for fast text file searching using keyword index on disk.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: September 2, 2013
# URL: http://peterodding.com/code/vim/notes/
# License: MIT
#
# This Python script can be used by the notes.vim plug-in to perform fast
# keyword searches in the user's notes. It has two advantages over just
# using Vim's internal :vimgrep command to search all of the user's notes:
#
#  - Very large notes don't slow searching down so much;
#  - Hundreds of notes can be searched in less than a second.
#
# The keyword index is a Python dictionary that's persisted using the pickle
# module. The structure of the dictionary may seem very naive but it's quite
# fast. Also the pickle protocol makes sure repeating strings are stored only
# once, so it's not as bad as it may appear at first sight :-).
#
# For more information about the Vim plug-in see http://peterodding.com/code/vim/notes/.
|
"""
|
|
Usage: search-notes.py [OPTIONS] KEYWORD...
|
|
|
|
Search one or more directories of plain text files using a full text index,
|
|
updated automatically during each invocation of the program.
|
|
|
|
Valid options include:
|
|
|
|
-i, --ignore-case ignore case of keyword(s)
|
|
-l, --list=SUBSTR list keywords matching substring
|
|
-d, --database=FILE set path to keywords index file
|
|
-n, --notes=DIR set directory with user notes (can be repeated)
|
|
-e, --encoding=NAME set character encoding of notes
|
|
-v, --verbose make more noise
|
|
-h, --help show this message and exit
|
|
|
|
For more information see http://peterodding.com/code/vim/notes/
|
|
"""
|
|
|
|
# Standard library modules.
import fnmatch
import getopt
import logging
import os
import re
import sys
import time

# Load the faster C variant of the pickle module where possible, but
# fall back to the Python implementation that's always available.
try:
  import cPickle as pickle
except ImportError:
  import pickle

# Try to import the Levenshtein module, don't error out if it's not installed.
# When available it's used to rank keyword listings by edit distance.
try:
  import Levenshtein
  levenshtein_supported = True
except ImportError:
  levenshtein_supported = False

# The version of the index format that's supported by this revision of the
# `search-notes.py' script; if an existing index file is found with an
# unsupported version, the script knows that it should rebuild the index.
INDEX_VERSION = 2
|
|
|
|
class NotesIndex:
|
|
|
|
def __init__(self):
|
|
''' Entry point to the notes search. '''
|
|
global_timer = Timer()
|
|
self.init_logging()
|
|
keywords = self.parse_args()
|
|
self.load_index()
|
|
self.update_index()
|
|
if self.dirty:
|
|
self.save_index()
|
|
print "Python works fine!"
|
|
if self.keyword_filter is not None:
|
|
self.list_keywords(self.keyword_filter)
|
|
self.logger.debug("Finished listing keywords in %s", global_timer)
|
|
else:
|
|
matches = self.search_index(keywords)
|
|
if matches:
|
|
print '\n'.join(sorted(matches))
|
|
self.logger.debug("Finished searching index in %s", global_timer)
|
|
|
|
def init_logging(self):
|
|
''' Initialize the logging subsystem. '''
|
|
self.logger = logging.getLogger('search-notes')
|
|
self.logger.addHandler(logging.StreamHandler(sys.stderr))
|
|
if os.isatty(0):
|
|
self.logger.setLevel(logging.INFO)
|
|
|
|
def parse_args(self):
|
|
''' Parse the command line arguments. '''
|
|
try:
|
|
opts, keywords = getopt.getopt(sys.argv[1:], 'il:d:n:e:vh',
|
|
['ignore-case', 'list=', 'database=', 'notes=', 'encoding=', 'verbose', 'help'])
|
|
except getopt.GetoptError, error:
|
|
print str(error)
|
|
self.usage()
|
|
sys.exit(2)
|
|
# Define the command line option defaults.
|
|
self.database_file = '~/.vim/misc/notes/index.pickle'
|
|
self.user_directories = ['~/.vim/misc/notes/user/']
|
|
self.character_encoding = 'UTF-8'
|
|
self.case_sensitive = True
|
|
self.keyword_filter = None
|
|
# Map command line options to variables.
|
|
for opt, arg in opts:
|
|
if opt in ('-i', '--ignore-case'):
|
|
self.case_sensitive = False
|
|
self.logger.debug("Disabling case sensitivity")
|
|
elif opt in ('-l', '--list'):
|
|
self.keyword_filter = arg.strip().lower()
|
|
elif opt in ('-d', '--database'):
|
|
self.database_file = arg
|
|
elif opt in ('-n', '--notes'):
|
|
self.user_directories.append(arg)
|
|
elif opt in ('-e', '--encoding'):
|
|
self.character_encoding = arg
|
|
elif opt in ('-v', '--verbose'):
|
|
self.logger.setLevel(logging.DEBUG)
|
|
elif opt in ('-h', '--help'):
|
|
self.usage()
|
|
sys.exit(0)
|
|
else:
|
|
assert False, "Unhandled option"
|
|
self.logger.debug("Index file: %s", self.database_file)
|
|
self.logger.debug("Notes directories: %r", self.user_directories)
|
|
self.logger.debug("Character encoding: %s", self.character_encoding)
|
|
if self.keyword_filter is not None:
|
|
self.keyword_filter = self.decode(self.keyword_filter)
|
|
# Canonicalize pathnames, check validity.
|
|
self.database_file = self.munge_path(self.database_file)
|
|
self.user_directories = map(self.munge_path, self.user_directories)
|
|
self.user_directories = filter(os.path.isdir, self.user_directories)
|
|
if not any(os.path.isdir(p) for p in self.user_directories):
|
|
sys.stderr.write("None of the notes directories exist!\n")
|
|
sys.exit(1)
|
|
# Return tokenized keyword arguments.
|
|
return [self.normalize(k) for k in self.tokenize(' '.join(keywords))]
|
|
|
|
def load_index(self):
|
|
''' Load the keyword index or start with an empty one. '''
|
|
try:
|
|
load_timer = Timer()
|
|
self.logger.debug("Loading index from %s ..", self.database_file)
|
|
with open(self.database_file) as handle:
|
|
self.index = pickle.load(handle)
|
|
self.logger.debug("Format version of index loaded from disk: %i", self.index['version'])
|
|
assert self.index['version'] == INDEX_VERSION, "Incompatible index format detected!"
|
|
self.first_use = False
|
|
self.dirty = False
|
|
self.logger.debug("Loaded %i notes from index in %s", len(self.index['files']), load_timer)
|
|
except Exception, e:
|
|
self.logger.warn("Failed to load index from file: %s", e)
|
|
self.first_use = True
|
|
self.dirty = True
|
|
self.index = {'keywords': {}, 'files': {}, 'version': INDEX_VERSION}
|
|
|
|
def save_index(self):
|
|
''' Save the keyword index to disk. '''
|
|
save_timer = Timer()
|
|
with open(self.database_file, 'w') as handle:
|
|
pickle.dump(self.index, handle)
|
|
self.logger.debug("Saved index to disk in %s", save_timer)
|
|
|
|
def update_index(self):
|
|
''' Update the keyword index by scanning the notes directory. '''
|
|
update_timer = Timer()
|
|
# First we find the filenames and last modified times of the notes on disk.
|
|
notes_on_disk = {}
|
|
last_count = 0
|
|
for directory in self.user_directories:
|
|
for filename in os.listdir(directory):
|
|
# Vim swap files are ignored.
|
|
if (filename != '.swp' and not fnmatch.fnmatch(filename, '.s??')
|
|
and not fnmatch.fnmatch(filename, '.*.s??')):
|
|
abspath = os.path.join(directory, filename)
|
|
if os.path.isfile(abspath):
|
|
notes_on_disk[abspath] = os.path.getmtime(abspath)
|
|
self.logger.info("Found %i notes in %s ..", len(notes_on_disk) - last_count, directory)
|
|
last_count = len(notes_on_disk)
|
|
# Check for updated and/or deleted notes since the last run?
|
|
if not self.first_use:
|
|
for filename in self.index['files'].keys():
|
|
if filename not in notes_on_disk:
|
|
# Forget a deleted note.
|
|
self.delete_note(filename)
|
|
else:
|
|
# Check whether previously seen note has changed?
|
|
last_modified_on_disk = notes_on_disk[filename]
|
|
last_modified_in_db = self.index['files'][filename]
|
|
if last_modified_on_disk > last_modified_in_db:
|
|
self.delete_note(filename)
|
|
self.add_note(filename, last_modified_on_disk)
|
|
# Already checked this note, we can forget about it.
|
|
del notes_on_disk[filename]
|
|
# Add new notes to index.
|
|
for filename, last_modified in notes_on_disk.iteritems():
|
|
self.add_note(filename, last_modified)
|
|
self.logger.debug("Updated index in %s", update_timer)
|
|
|
|
def add_note(self, filename, last_modified):
|
|
''' Add a note to the index (assumes the note is not already indexed). '''
|
|
self.logger.info("Adding file to index: %s", filename)
|
|
self.index['files'][filename] = last_modified
|
|
with open(filename) as handle:
|
|
for kw in self.tokenize(handle.read()):
|
|
if kw not in self.index['keywords']:
|
|
self.index['keywords'][kw] = [filename]
|
|
else:
|
|
self.index['keywords'][kw].append(filename)
|
|
self.dirty = True
|
|
|
|
def delete_note(self, filename):
|
|
''' Remove a note from the index. '''
|
|
self.logger.info("Removing file from index: %s", filename)
|
|
del self.index['files'][filename]
|
|
for kw in self.index['keywords']:
|
|
self.index['keywords'][kw] = [x for x in self.index['keywords'][kw] if x != filename]
|
|
self.dirty = True
|
|
|
|
def search_index(self, keywords):
|
|
''' Return names of files containing all of the given keywords. '''
|
|
matches = None
|
|
normalized_db_keywords = [(k, self.normalize(k)) for k in self.index['keywords']]
|
|
for usr_kw in keywords:
|
|
submatches = set()
|
|
for original_db_kw, normalized_db_kw in normalized_db_keywords:
|
|
# Yes I'm using a nested for loop over all keywords in the index. If
|
|
# I really have to I'll probably come up with something more
|
|
# efficient, but really it doesn't seem to be needed -- I have over
|
|
# 850 notes (about 8 MB) and 25000 keywords and it's plenty fast.
|
|
if usr_kw in normalized_db_kw:
|
|
submatches.update(self.index['keywords'][original_db_kw])
|
|
if matches is None:
|
|
matches = submatches
|
|
else:
|
|
matches &= submatches
|
|
return list(matches) if matches else []
|
|
|
|
def list_keywords(self, substring, limit=25):
|
|
''' Print all (matching) keywords to standard output. '''
|
|
decorated = []
|
|
substring = self.normalize(substring)
|
|
for kw, filenames in self.index['keywords'].iteritems():
|
|
normalized_kw = self.normalize(kw)
|
|
if substring in normalized_kw:
|
|
if levenshtein_supported:
|
|
decorated.append((Levenshtein.distance(normalized_kw, substring), -len(filenames), kw))
|
|
else:
|
|
decorated.append((-len(filenames), kw))
|
|
decorated.sort()
|
|
selection = [d[-1] for d in decorated[:limit]]
|
|
print self.encode(u'\n'.join(selection))
|
|
|
|
def tokenize(self, text):
|
|
''' Tokenize a string into a list of normalized, unique keywords. '''
|
|
words = set()
|
|
text = self.decode(text)
|
|
for word in re.findall(r'\w+', text, re.UNICODE):
|
|
word = word.strip()
|
|
if word != '' and not word.isspace() and len(word) >= 2:
|
|
words.add(word)
|
|
return words
|
|
|
|
def normalize(self, keyword):
|
|
''' Normalize the case of a keyword if configured to do so. '''
|
|
return keyword if self.case_sensitive else keyword.lower()
|
|
|
|
def encode(self, text):
|
|
''' Encode a string in the user's preferred character encoding. '''
|
|
return text.encode(self.character_encoding, 'ignore')
|
|
|
|
def decode(self, text):
|
|
''' Decode a string in the user's preferred character encoding. '''
|
|
return text.decode(self.character_encoding, 'ignore')
|
|
|
|
def munge_path(self, path):
|
|
''' Canonicalize user-defined path, making it absolute. '''
|
|
return os.path.abspath(os.path.expanduser(path))
|
|
|
|
def usage(self):
|
|
print __doc__.strip()
|
|
|
|
class Timer:

  """
  Simple stopwatch that reports the wall-clock time elapsed since creation.
  """

  def __init__(self):
    # Remember the moment the timer started counting.
    self.start_time = time.time()

  @property
  def elapsed_time(self):
    ''' Number of seconds (as a float) since the timer was created. '''
    now = time.time()
    return now - self.start_time

  def __str__(self):
    ''' Render the elapsed time as a human readable string. '''
    return '%.2f seconds' % self.elapsed_time
|
|
|
|
# Run the search when executed as a script (all work happens in __init__).
if __name__ == '__main__':
  NotesIndex()
|
|
|
|
# vim: ts=2 sw=2 et
|