#!/usr/bin/env python
# Python script for fast text file searching using keyword index on disk.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: September 2, 2013
# URL: http://peterodding.com/code/vim/notes/
# License: MIT
#
# This Python script can be used by the notes.vim plug-in to perform fast
# keyword searches in the user's notes. It has two advantages over just
# using Vim's internal :vimgrep command to search all of the user's notes:
#
# - Very large notes don't slow searching down so much;
# - Hundreds of notes can be searched in less than a second.
#
# The keyword index is a Python dictionary that's persisted using the pickle
# module. The structure of the dictionary may seem very naive but it's quite
# fast. Also the pickle protocol makes sure repeating strings are stored only
# once, so it's not as bad as it may appear at first sight :-).
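#
# Roughly, the index dictionary looks like this (the values below are just
# illustrative):
#
#   {'version': 2,
#    'files': {'/abs/path/to/note': 1378102495.0, ...},          # note -> mtime
#    'keywords': {u'keyword': ['/abs/path/to/note', ...], ...}}  # keyword -> notes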
#
# For more information about the Vim plug-in see http://peterodding.com/code/vim/notes/.
"""
Usage: search-notes.py [OPTIONS] KEYWORD...
Search one or more directories of plain text files using a full text index,
updated automatically during each invocation of the program.
Valid options include:
-i, --ignore-case ignore case of keyword(s)
-l, --list=SUBSTR list keywords matching substring
-d, --database=FILE set path to keywords index file
-n, --notes=DIR set directory with user notes (can be repeated)
-e, --encoding=NAME set character encoding of notes
-v, --verbose make more noise
-h, --help show this message and exit
For more information see http://peterodding.com/code/vim/notes/
"""
# Standard library modules.
import fnmatch
import getopt
import logging
import os
import re
import sys
import time
# Load the faster C variant of the pickle module where possible, but
# fall back to the Python implementation that's always available.
try:
  import cPickle as pickle
except ImportError:
  import pickle

# Try to import the Levenshtein module, don't error out if it's not installed.
try:
  import Levenshtein
  levenshtein_supported = True
except ImportError:
  levenshtein_supported = False
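# When available, Levenshtein is only used in list_keywords() below, to rank
# keyword completions by edit distance to the typed substring; searching
# itself works without it.
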
# The version of the index format that's supported by this revision of the
# `search-notes.py' script; if an existing index file is found with an
# unsupported version, the script knows that it should rebuild the index.
INDEX_VERSION = 2

class NotesIndex:

  def __init__(self):
    ''' Entry point to the notes search. '''
    global_timer = Timer()
    self.init_logging()
    keywords = self.parse_args()
    self.load_index()
    self.update_index()
    if self.dirty:
      self.save_index()
    if self.keyword_filter is not None:
      self.list_keywords(self.keyword_filter)
      self.logger.debug("Finished listing keywords in %s", global_timer)
    else:
      matches = self.search_index(keywords)
      if matches:
        print '\n'.join(sorted(matches))
      self.logger.debug("Finished searching index in %s", global_timer)

  def init_logging(self):
    ''' Initialize the logging subsystem. '''
    self.logger = logging.getLogger('search-notes')
    self.logger.addHandler(logging.StreamHandler(sys.stderr))
    if os.isatty(0):
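      # Only be chatty when the script is run interactively (stdin is a terminal).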
      self.logger.setLevel(logging.INFO)

  def parse_args(self):
    ''' Parse the command line arguments. '''
    try:
      opts, keywords = getopt.getopt(sys.argv[1:], 'il:d:n:e:vh',
          ['ignore-case', 'list=', 'database=', 'notes=', 'encoding=', 'verbose', 'help'])
    except getopt.GetoptError, error:
      print str(error)
      self.usage()
      sys.exit(2)
    # Define the command line option defaults.
    self.database_file = '~/.vim/misc/notes/index.pickle'
    self.user_directories = ['~/.vim/misc/notes/user/']
    self.character_encoding = 'UTF-8'
    self.case_sensitive = True
    self.keyword_filter = None
    # Map command line options to variables.
    for opt, arg in opts:
      if opt in ('-i', '--ignore-case'):
        self.case_sensitive = False
        self.logger.debug("Disabling case sensitivity")
      elif opt in ('-l', '--list'):
        self.keyword_filter = arg.strip().lower()
      elif opt in ('-d', '--database'):
        self.database_file = arg
      elif opt in ('-n', '--notes'):
        self.user_directories.append(arg)
      elif opt in ('-e', '--encoding'):
        self.character_encoding = arg
      elif opt in ('-v', '--verbose'):
        self.logger.setLevel(logging.DEBUG)
      elif opt in ('-h', '--help'):
        self.usage()
        sys.exit(0)
      else:
        assert False, "Unhandled option"
    self.logger.debug("Index file: %s", self.database_file)
    self.logger.debug("Notes directories: %r", self.user_directories)
    self.logger.debug("Character encoding: %s", self.character_encoding)
    if self.keyword_filter is not None:
      self.keyword_filter = self.decode(self.keyword_filter)
    # Canonicalize pathnames, check validity.
    self.database_file = self.munge_path(self.database_file)
    self.user_directories = map(self.munge_path, self.user_directories)
    self.user_directories = filter(os.path.isdir, self.user_directories)
    if not any(os.path.isdir(p) for p in self.user_directories):
      sys.stderr.write("None of the notes directories exist!\n")
      sys.exit(1)
    # Return tokenized keyword arguments.
    return [self.normalize(k) for k in self.tokenize(' '.join(keywords))]

  def load_index(self):
    ''' Load the keyword index or start with an empty one. '''
    try:
      load_timer = Timer()
      self.logger.debug("Loading index from %s ..", self.database_file)
      with open(self.database_file) as handle:
        self.index = pickle.load(handle)
      self.logger.debug("Format version of index loaded from disk: %i", self.index['version'])
      assert self.index['version'] == INDEX_VERSION, "Incompatible index format detected!"
      self.first_use = False
      self.dirty = False
      self.logger.debug("Loaded %i notes from index in %s", len(self.index['files']), load_timer)
    except Exception, e:
      self.logger.warn("Failed to load index from file: %s", e)
      self.first_use = True
      self.dirty = True
      self.index = {'keywords': {}, 'files': {}, 'version': INDEX_VERSION}

  def save_index(self):
    ''' Save the keyword index to disk. '''
    save_timer = Timer()
    with open(self.database_file, 'w') as handle:
      pickle.dump(self.index, handle)
    self.logger.debug("Saved index to disk in %s", save_timer)

  def update_index(self):
    ''' Update the keyword index by scanning the notes directory. '''
    update_timer = Timer()
    # First we find the filenames and last modified times of the notes on disk.
    notes_on_disk = {}
    last_count = 0
    for directory in self.user_directories:
      for filename in os.listdir(directory):
        # Vim swap files are ignored.
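        # (The pattern '.s??' covers names like '.swp' and '.swo', while
        # '.*.s??' covers hidden swap files such as '.notename.swp'.)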
        if (filename != '.swp' and not fnmatch.fnmatch(filename, '.s??')
            and not fnmatch.fnmatch(filename, '.*.s??')):
          abspath = os.path.join(directory, filename)
          if os.path.isfile(abspath):
            notes_on_disk[abspath] = os.path.getmtime(abspath)
      self.logger.info("Found %i notes in %s ..", len(notes_on_disk) - last_count, directory)
      last_count = len(notes_on_disk)
    # Check for updated and/or deleted notes since the last run.
    if not self.first_use:
      for filename in self.index['files'].keys():
        if filename not in notes_on_disk:
          # Forget a deleted note.
          self.delete_note(filename)
        else:
          # Check whether a previously seen note has changed.
          last_modified_on_disk = notes_on_disk[filename]
          last_modified_in_db = self.index['files'][filename]
          if last_modified_on_disk > last_modified_in_db:
            self.delete_note(filename)
            self.add_note(filename, last_modified_on_disk)
          # Already checked this note, we can forget about it.
          del notes_on_disk[filename]
    # Add new notes to the index.
    for filename, last_modified in notes_on_disk.iteritems():
      self.add_note(filename, last_modified)
    self.logger.debug("Updated index in %s", update_timer)

  def add_note(self, filename, last_modified):
    ''' Add a note to the index (assumes the note is not already indexed). '''
    self.logger.info("Adding file to index: %s", filename)
    self.index['files'][filename] = last_modified
    with open(filename) as handle:
      for kw in self.tokenize(handle.read()):
        if kw not in self.index['keywords']:
          self.index['keywords'][kw] = [filename]
        else:
          self.index['keywords'][kw].append(filename)
    self.dirty = True

  def delete_note(self, filename):
    ''' Remove a note from the index. '''
    self.logger.info("Removing file from index: %s", filename)
    del self.index['files'][filename]
    for kw in self.index['keywords']:
      self.index['keywords'][kw] = [x for x in self.index['keywords'][kw] if x != filename]
    self.dirty = True

  def search_index(self, keywords):
    ''' Return names of files containing all of the given keywords. '''
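    # A user keyword matches when it occurs as a substring of an indexed
    # keyword, and the per-keyword result sets are intersected below, so a
    # note has to match every given keyword.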
    matches = None
    normalized_db_keywords = [(k, self.normalize(k)) for k in self.index['keywords']]
    for usr_kw in keywords:
      submatches = set()
      for original_db_kw, normalized_db_kw in normalized_db_keywords:
        # Yes I'm using a nested for loop over all keywords in the index. If
        # I really have to I'll probably come up with something more
        # efficient, but really it doesn't seem to be needed -- I have over
        # 850 notes (about 8 MB) and 25000 keywords and it's plenty fast.
        if usr_kw in normalized_db_kw:
          submatches.update(self.index['keywords'][original_db_kw])
      if matches is None:
        matches = submatches
      else:
        matches &= submatches
    return list(matches) if matches else []

  def list_keywords(self, substring, limit=25):
    ''' Print all (matching) keywords to standard output. '''
    decorated = []
    substring = self.normalize(substring)
    for kw, filenames in self.index['keywords'].iteritems():
      normalized_kw = self.normalize(kw)
      if substring in normalized_kw:
        if levenshtein_supported:
          decorated.append((Levenshtein.distance(normalized_kw, substring), -len(filenames), kw))
        else:
          decorated.append((-len(filenames), kw))
    decorated.sort()
    selection = [d[-1] for d in decorated[:limit]]
    print self.encode(u'\n'.join(selection))

  def tokenize(self, text):
    ''' Tokenize a string into a set of unique keywords (two characters or longer). '''
    words = set()
    text = self.decode(text)
    for word in re.findall(r'\w+', text, re.UNICODE):
      word = word.strip()
      if word != '' and not word.isspace() and len(word) >= 2:
        words.add(word)
    return words

  def normalize(self, keyword):
    ''' Normalize the case of a keyword if configured to do so. '''
    return keyword if self.case_sensitive else keyword.lower()

  def encode(self, text):
    ''' Encode a string in the user's preferred character encoding. '''
    return text.encode(self.character_encoding, 'ignore')

  def decode(self, text):
    ''' Decode a string in the user's preferred character encoding. '''
    return text.decode(self.character_encoding, 'ignore')

  def munge_path(self, path):
    ''' Canonicalize a user-defined path, making it absolute. '''
    return os.path.abspath(os.path.expanduser(path))

  def usage(self):
    print __doc__.strip()

class Timer:

  """
  Easy to use timer to keep track of long-running operations.
  """

  def __init__(self):
    self.start_time = time.time()

  def __str__(self):
    return "%.2f seconds" % self.elapsed_time

  @property
  def elapsed_time(self):
    return time.time() - self.start_time


if __name__ == '__main__':
  NotesIndex()

# vim: ts=2 sw=2 et