523 lines
20 KiB
Common Lisp
523 lines
20 KiB
Common Lisp
;;;
|
|
;;; This code was written by:
|
|
;;;
|
|
;;; Lawrence E. Freil <lef@freil.com>
|
|
;;; National Science Center Foundation
|
|
;;; Augusta, Georgia 30909
|
|
;;;
|
|
;;; This program was released into the public domain on 2005-08-31.
|
|
;;; (See the slime-devel mailing list archive for details.)
|
|
;;;
|
|
;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
|
|
;;; parser.
|
|
;;;
|
|
;;; This regular expression parser operates by taking a
|
|
;;; regular expression and breaking it down into a list
|
|
;;; consisting of lisp expressions and flags. The list
|
|
;;; of lisp expressions is then taken in turned into a
|
|
;;; lambda expression that can be later applied to a
|
|
;;; string argument for parsing.
|
|
;;;;
|
|
;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
|
|
;;;; to get working with Corman Lisp 1.42, add package statement and export
|
|
;;;; relevant functions.
|
|
;;;;
|
|
|
|
(in-package :cl-user)
|
|
|
|
;; Renamed to slime-nregex avoid name clashes with other versions of
|
|
;; this file. -- he
|
|
|
|
;;;; CND - 6/3/2001
|
|
(defpackage slime-nregex
|
|
(:use #:common-lisp)
|
|
(:export
|
|
#:regex
|
|
#:regex-compile
|
|
))
|
|
|
|
;;;; CND - 6/3/2001
|
|
(in-package :slime-nregex)
|
|
|
|
;;;
|
|
;;; First we create a copy of macros to help debug the beast
|
|
(eval-when (:compile-toplevel :load-toplevel :execute)
|
|
(defvar *regex-debug* nil) ; Set to nil for no debugging code
|
|
)
|
|
|
|
(defmacro info (message &rest args)
|
|
(if *regex-debug*
|
|
`(format *standard-output* ,message ,@args)))
|
|
|
|
;;;
|
|
;;; Declare the global variables for storing the paren index list.
|
|
;;;
|
|
(defvar *regex-groups* (make-array 10))
|
|
(defvar *regex-groupings* 0)
|
|
|
|
;;;
|
|
;;; Declare a simple interface for testing. You probably wouldn't want
|
|
;;; to use this interface unless you were just calling this once.
|
|
;;;
|
|
(defun regex (expression string)
|
|
"Usage: (regex <expression> <string)
|
|
This function will call regex-compile on the expression and then apply
|
|
the string to the returned lambda list."
|
|
(let ((findit (cond ((stringp expression)
|
|
(regex-compile expression))
|
|
((listp expression)
|
|
expression)))
|
|
(result nil))
|
|
(if (not (funcall (if (functionp findit)
|
|
findit
|
|
(eval `(function ,findit))) string))
|
|
(return-from regex nil))
|
|
(if (= *regex-groupings* 0)
|
|
(return-from regex t))
|
|
(dotimes (i *regex-groupings*)
|
|
(push (funcall 'subseq
|
|
string
|
|
(car (aref *regex-groups* i))
|
|
(cadr (aref *regex-groups* i)))
|
|
result))
|
|
(reverse result)))
|
|
|
|
;;;
|
|
;;; Declare some simple macros to make the code more readable.
|
|
;;;
|
|
(defvar *regex-special-chars* "?*+.()[]\\${}")
|
|
|
|
(defmacro add-exp (list)
|
|
"Add an item to the end of expression"
|
|
`(setf expression (append expression ,list)))
|
|
|
|
;;;
|
|
;;; Define a function that will take a quoted character and return
|
|
;;; what the real character should be plus how much of the source
|
|
;;; string was used. If the result is a set of characters, return an
|
|
;;; array of bits indicating which characters should be set. If the
|
|
;;; expression is one of the sub-group matches return a
|
|
;;; list-expression that will provide the match.
|
|
;;;
|
|
(defun regex-quoted (char-string &optional (invert nil))
|
|
"Usage: (regex-quoted <char-string> &optional invert)
|
|
Returns either the quoted character or a simple bit vector of bits set for
|
|
the matching values"
|
|
(let ((first (char char-string 0))
|
|
(result (char char-string 0))
|
|
(used-length 1))
|
|
(cond ((eql first #\n)
|
|
(setf result #\NewLine))
|
|
((eql first #\c)
|
|
(setf result #\Return))
|
|
((eql first #\t)
|
|
(setf result #\Tab))
|
|
((eql first #\d)
|
|
(setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
|
|
((eql first #\D)
|
|
(setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
|
|
((eql first #\w)
|
|
(setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
|
|
((eql first #\W)
|
|
(setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
|
|
((eql first #\b)
|
|
(setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
|
|
((eql first #\B)
|
|
(setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
|
|
((eql first #\s)
|
|
(setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
|
|
((eql first #\S)
|
|
(setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
|
|
((and (>= (char-code first) (char-code #\0))
|
|
(<= (char-code first) (char-code #\9)))
|
|
(if (and (> (length char-string) 2)
|
|
(and (>= (char-code (char char-string 1)) (char-code #\0))
|
|
(<= (char-code (char char-string 1)) (char-code #\9))
|
|
(>= (char-code (char char-string 2)) (char-code #\0))
|
|
(<= (char-code (char char-string 2)) (char-code #\9))))
|
|
;;
|
|
;; It is a single character specified in octal
|
|
;;
|
|
(progn
|
|
(setf result (do ((x 0 (1+ x))
|
|
(return 0))
|
|
((= x 2) return)
|
|
(setf return (+ (* return 8)
|
|
(- (char-code (char char-string x))
|
|
(char-code #\0))))))
|
|
(setf used-length 3))
|
|
;;
|
|
;; We have a group number replacement.
|
|
;;
|
|
(let ((group (- (char-code first) (char-code #\0))))
|
|
(setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
|
|
(cadr (aref *regex-groups* ,group)))))
|
|
(if (< length (+ index (length nstring)))
|
|
(return-from compare nil))
|
|
(if (not (string= string nstring
|
|
:start1 index
|
|
:end1 (+ index (length nstring))))
|
|
(return-from compare nil)
|
|
(incf index (length nstring)))))))))
|
|
(t
|
|
(setf result first)))
|
|
(if (and (vectorp result) invert)
|
|
(bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
|
|
(values result used-length)))
|
|
|
|
;;;
|
|
;;; Now for the main regex compiler routine.
|
|
;;;
|
|
(defun regex-compile (source &key (anchored nil))
|
|
"Usage: (regex-compile <expression> [ :anchored (t/nil) ])
|
|
This function take a regular expression (supplied as source) and
|
|
compiles this into a lambda list that a string argument can then
|
|
be applied to. It is also possible to compile this lambda list
|
|
for better performance or to save it as a named function for later
|
|
use"
|
|
(info "Now entering regex-compile with \"~A\"~%" source)
|
|
;;
|
|
;; This routine works in two parts.
|
|
;; The first pass take the regular expression and produces a list of
|
|
;; operators and lisp expressions for the entire regular expression.
|
|
;; The second pass takes this list and produces the lambda expression.
|
|
(let ((expression '()) ; holder for expressions
|
|
(group 1) ; Current group index
|
|
(group-stack nil) ; Stack of current group endings
|
|
(result nil) ; holder for built expression.
|
|
(fast-first nil)) ; holder for quick unanchored scan
|
|
;;
|
|
;; If the expression was an empty string then it alway
|
|
;; matches (so lets leave early)
|
|
;;
|
|
(if (= (length source) 0)
|
|
(return-from regex-compile
|
|
'(lambda (&rest args)
|
|
(declare (ignore args))
|
|
t)))
|
|
;;
|
|
;; If the first character is a caret then set the anchored
|
|
;; flags and remove if from the expression string.
|
|
;;
|
|
(cond ((eql (char source 0) #\^)
|
|
(setf source (subseq source 1))
|
|
(setf anchored t)))
|
|
;;
|
|
;; If the first sequence is .* then also set the anchored flags.
|
|
;; (This is purely for optimization, it will work without this).
|
|
;;
|
|
(if (>= (length source) 2)
|
|
(if (string= source ".*" :start1 0 :end1 2)
|
|
(setf anchored t)))
|
|
;;
|
|
;; Also, If this is not an anchored search and the first character is
|
|
;; a literal, then do a quick scan to see if it is even in the string.
|
|
;; If not then we can issue a quick nil,
|
|
;; otherwise we can start the search at the matching character to skip
|
|
;; the checks of the non-matching characters anyway.
|
|
;;
|
|
;; If I really wanted to speed up this section of code it would be
|
|
;; easy to recognize the case of a fairly long multi-character literal
|
|
;; and generate a Boyer-Moore search for the entire literal.
|
|
;;
|
|
;; I generate the code to do a loop because on CMU Lisp this is about
|
|
;; twice as fast a calling position.
|
|
;;
|
|
(if (and (not anchored)
|
|
(not (position (char source 0) *regex-special-chars*))
|
|
(not (and (> (length source) 1)
|
|
(position (char source 1) *regex-special-chars*))))
|
|
(setf fast-first `((if (not (dotimes (i length nil)
|
|
(if (eql (char string i)
|
|
,(char source 0))
|
|
(return (setf start i)))))
|
|
(return-from final-return nil)))))
|
|
;;
|
|
;; Generate the very first expression to save the starting index
|
|
;; so that group 0 will be the entire string matched always
|
|
;;
|
|
(add-exp '((setf (aref *regex-groups* 0)
|
|
(list index nil))))
|
|
;;
|
|
;; Loop over each character in the regular expression building the
|
|
;; expression list as we go.
|
|
;;
|
|
(do ((eindex 0 (1+ eindex)))
|
|
((= eindex (length source)))
|
|
(let ((current (char source eindex)))
|
|
(info "Now processing character ~A index = ~A~%" current eindex)
|
|
(case current
|
|
((#\.)
|
|
;;
|
|
;; Generate code for a single wild character
|
|
;;
|
|
(add-exp '((if (>= index length)
|
|
(return-from compare nil)
|
|
(incf index)))))
|
|
((#\$)
|
|
;;
|
|
;; If this is the last character of the expression then
|
|
;; anchor the end of the expression, otherwise let it slide
|
|
;; as a standard character (even though it should be quoted).
|
|
;;
|
|
(if (= eindex (1- (length source)))
|
|
(add-exp '((if (not (= index length))
|
|
(return-from compare nil))))
|
|
(add-exp '((if (not (and (< index length)
|
|
(eql (char string index) #\$)))
|
|
(return-from compare nil)
|
|
(incf index))))))
|
|
((#\*)
|
|
(add-exp '(ASTRISK)))
|
|
|
|
((#\+)
|
|
(add-exp '(PLUS)))
|
|
|
|
((#\?)
|
|
(add-exp '(QUESTION)))
|
|
|
|
((#\()
|
|
;;
|
|
;; Start a grouping.
|
|
;;
|
|
(incf group)
|
|
(push group group-stack)
|
|
(add-exp `((setf (aref *regex-groups* ,(1- group))
|
|
(list index nil))))
|
|
(add-exp `(,group)))
|
|
((#\))
|
|
;;
|
|
;; End a grouping
|
|
;;
|
|
(let ((group (pop group-stack)))
|
|
(add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
|
|
index)))
|
|
(add-exp `(,(- group)))))
|
|
((#\[)
|
|
;;
|
|
;; Start of a range operation.
|
|
;; Generate a bit-vector that has one bit per possible character
|
|
;; and then on each character or range, set the possible bits.
|
|
;;
|
|
;; If the first character is carat then invert the set.
|
|
(let* ((invert (eql (char source (1+ eindex)) #\^))
|
|
(bitstring (make-array 256 :element-type 'bit
|
|
:initial-element
|
|
(if invert 1 0)))
|
|
(set-char (if invert 0 1)))
|
|
(if invert (incf eindex))
|
|
(do ((x (1+ eindex) (1+ x)))
|
|
((eql (char source x) #\]) (setf eindex x))
|
|
(info "Building range with character ~A~%" (char source x))
|
|
(cond ((and (eql (char source (1+ x)) #\-)
|
|
(not (eql (char source (+ x 2)) #\])))
|
|
(if (>= (char-code (char source x))
|
|
(char-code (char source (+ 2 x))))
|
|
(error "Invalid range \"~A-~A\". Ranges must be in acending order"
|
|
(char source x) (char source (+ 2 x))))
|
|
(do ((j (char-code (char source x)) (1+ j)))
|
|
((> j (char-code (char source (+ 2 x))))
|
|
(incf x 2))
|
|
(info "Setting bit for char ~A code ~A~%" (code-char j) j)
|
|
(setf (sbit bitstring j) set-char)))
|
|
(t
|
|
(cond ((not (eql (char source x) #\]))
|
|
(let ((char (char source x)))
|
|
;;
|
|
;; If the character is quoted then find out what
|
|
;; it should have been
|
|
;;
|
|
(if (eql (char source x) #\\ )
|
|
(let ((length))
|
|
(multiple-value-setq (char length)
|
|
(regex-quoted (subseq source x) invert))
|
|
(incf x length)))
|
|
(info "Setting bit for char ~A code ~A~%" char (char-code char))
|
|
(if (not (vectorp char))
|
|
(setf (sbit bitstring (char-code (char source x))) set-char)
|
|
(bit-ior bitstring char t))))))))
|
|
(add-exp `((let ((range ,bitstring))
|
|
(if (>= index length)
|
|
(return-from compare nil))
|
|
(if (= 1 (sbit range (char-code (char string index))))
|
|
(incf index)
|
|
(return-from compare nil)))))))
|
|
((#\\ )
|
|
;;
|
|
;; Intreprete the next character as a special, range, octal, group or
|
|
;; just the character itself.
|
|
;;
|
|
(let ((length)
|
|
(value))
|
|
(multiple-value-setq (value length)
|
|
(regex-quoted (subseq source (1+ eindex)) nil))
|
|
(cond ((listp value)
|
|
(add-exp value))
|
|
((characterp value)
|
|
(add-exp `((if (not (and (< index length)
|
|
(eql (char string index)
|
|
,value)))
|
|
(return-from compare nil)
|
|
(incf index)))))
|
|
((vectorp value)
|
|
(add-exp `((let ((range ,value))
|
|
(if (>= index length)
|
|
(return-from compare nil))
|
|
(if (= 1 (sbit range (char-code (char string index))))
|
|
(incf index)
|
|
(return-from compare nil)))))))
|
|
(incf eindex length)))
|
|
(t
|
|
;;
|
|
;; We have a literal character.
|
|
;; Scan to see how many we have and if it is more than one
|
|
;; generate a string= verses as single eql.
|
|
;;
|
|
(let* ((lit "")
|
|
(term (dotimes (litindex (- (length source) eindex) nil)
|
|
(let ((litchar (char source (+ eindex litindex))))
|
|
(if (position litchar *regex-special-chars*)
|
|
(return litchar)
|
|
(progn
|
|
(info "Now adding ~A index ~A to lit~%" litchar
|
|
litindex)
|
|
(setf lit (concatenate 'string lit
|
|
(string litchar)))))))))
|
|
(if (= (length lit) 1)
|
|
(add-exp `((if (not (and (< index length)
|
|
(eql (char string index) ,current)))
|
|
(return-from compare nil)
|
|
(incf index))))
|
|
;;
|
|
;; If we have a multi-character literal then we must
|
|
;; check to see if the next character (if there is one)
|
|
;; is an astrisk or a plus or a question mark. If so then we must not use this
|
|
;; character in the big literal.
|
|
(progn
|
|
(if (or (eql term #\*)
|
|
(eql term #\+)
|
|
(eql term #\?))
|
|
(setf lit (subseq lit 0 (1- (length lit)))))
|
|
(add-exp `((if (< length (+ index ,(length lit)))
|
|
(return-from compare nil))
|
|
(if (not (string= string ,lit :start1 index
|
|
:end1 (+ index ,(length lit))))
|
|
(return-from compare nil)
|
|
(incf index ,(length lit)))))))
|
|
(incf eindex (1- (length lit))))))))
|
|
;;
|
|
;; Plug end of list to return t. If we made it this far then
|
|
;; We have matched!
|
|
(add-exp '((setf (cadr (aref *regex-groups* 0))
|
|
index)))
|
|
(add-exp '((return-from final-return t)))
|
|
;;
|
|
;;; (print expression)
|
|
;;
|
|
;; Now take the expression list and turn it into a lambda expression
|
|
;; replacing the special flags with lisp code.
|
|
;; For example: A BEGIN needs to be replace by an expression that
|
|
;; saves the current index, then evaluates everything till it gets to
|
|
;; the END then save the new index if it didn't fail.
|
|
;; On an ASTRISK I need to take the previous expression and wrap
|
|
;; it in a do that will evaluate the expression till an error
|
|
;; occurs and then another do that encompases the remainder of the
|
|
;; regular expression and iterates decrementing the index by one
|
|
;; of the matched expression sizes and then returns nil. After
|
|
;; the last expression insert a form that does a return t so that
|
|
;; if the entire nested sub-expression succeeds then the loop
|
|
;; is broken manually.
|
|
;;
|
|
(setf result (copy-tree nil))
|
|
;;
|
|
;; Reversing the current expression makes building up the
|
|
;; lambda list easier due to the nexting of expressions when
|
|
;; and astrisk has been encountered.
|
|
(setf expression (reverse expression))
|
|
(do ((elt 0 (1+ elt)))
|
|
((>= elt (length expression)))
|
|
(let ((piece (nth elt expression)))
|
|
;;
|
|
;; Now check for PLUS, if so then ditto the expression and then let the
|
|
;; ASTRISK below handle the rest.
|
|
;;
|
|
(cond ((eql piece 'PLUS)
|
|
(cond ((listp (nth (1+ elt) expression))
|
|
(setf result (append (list (nth (1+ elt) expression))
|
|
result)))
|
|
;;
|
|
;; duplicate the entire group
|
|
;; NOTE: This hasn't been implemented yet!!
|
|
(t
|
|
(error "GROUP repeat hasn't been implemented yet~%")))))
|
|
(cond ((listp piece) ;Just append the list
|
|
(setf result (append (list piece) result)))
|
|
((eql piece 'QUESTION) ; Wrap it in a block that won't fail
|
|
(cond ((listp (nth (1+ elt) expression))
|
|
(setf result
|
|
(append `((progn (block compare
|
|
,(nth (1+ elt)
|
|
expression))
|
|
t))
|
|
result))
|
|
(incf elt))
|
|
;;
|
|
;; This is a QUESTION on an entire group which
|
|
;; hasn't been implemented yet!!!
|
|
;;
|
|
(t
|
|
(error "Optional groups not implemented yet~%"))))
|
|
((or (eql piece 'ASTRISK) ; Do the wild thing!
|
|
(eql piece 'PLUS))
|
|
(cond ((listp (nth (1+ elt) expression))
|
|
;;
|
|
;; This is a single character wild card so
|
|
;; do the simple form.
|
|
;;
|
|
(setf result
|
|
`((let ((oindex index))
|
|
(block compare
|
|
(do ()
|
|
(nil)
|
|
,(nth (1+ elt) expression)))
|
|
(do ((start index (1- start)))
|
|
((< start oindex) nil)
|
|
(let ((index start))
|
|
(block compare
|
|
,@result))))))
|
|
(incf elt))
|
|
(t
|
|
;;
|
|
;; This is a subgroup repeated so I must build
|
|
;; the loop using several values.
|
|
;;
|
|
))
|
|
)
|
|
(t t)))) ; Just ignore everything else.
|
|
;;
|
|
;; Now wrap the result in a lambda list that can then be
|
|
;; invoked or compiled, however the user wishes.
|
|
;;
|
|
(if anchored
|
|
(setf result
|
|
`(lambda (string &key (start 0) (end (length string)))
|
|
(setf *regex-groupings* ,group)
|
|
(block final-return
|
|
(block compare
|
|
(let ((index start)
|
|
(length end))
|
|
,@result)))))
|
|
(setf result
|
|
`(lambda (string &key (start 0) (end (length string)))
|
|
(setf *regex-groupings* ,group)
|
|
(block final-return
|
|
(let ((length end))
|
|
,@fast-first
|
|
(do ((marker start (1+ marker)))
|
|
((> marker end) nil)
|
|
(let ((index marker))
|
|
(if (block compare
|
|
,@result)
|
|
(return t)))))))))))
|
|
|
|
;; (provide 'nregex)
|