summaryrefslogtreecommitdiffstats
path: root/crawl-ref/source/util/pcre/pcre_exec.c
diff options
context:
space:
mode:
Diffstat (limited to 'crawl-ref/source/util/pcre/pcre_exec.c')
-rw-r--r--crawl-ref/source/util/pcre/pcre_exec.c4953
1 files changed, 0 insertions, 4953 deletions
diff --git a/crawl-ref/source/util/pcre/pcre_exec.c b/crawl-ref/source/util/pcre/pcre_exec.c
deleted file mode 100644
index ed28ae7c97..0000000000
--- a/crawl-ref/source/util/pcre/pcre_exec.c
+++ /dev/null
@@ -1,4953 +0,0 @@
-/*************************************************
-* Perl-Compatible Regular Expressions *
-*************************************************/
-
-/* PCRE is a library of functions to support regular expressions whose syntax
-and semantics are as close as possible to those of the Perl 5 language.
-
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
-
------------------------------------------------------------------------------
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------
-*/
-
-
-/* This module contains pcre_exec(), the externally visible function that does
-pattern matching using an NFA algorithm, trying to mimic Perl as closely as
-possible. There are also some static supporting functions. */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#define NLBLOCK md /* Block containing newline information */
-#define PSSTART start_subject /* Field containing processed string start */
-#define PSEND end_subject /* Field containing processed string end */
-
-#include "pcre_internal.h"
-
-/* Undefine some potentially clashing cpp symbols */
-
-#undef min
-#undef max
-
-/* Flag bits for the match() function */
-
-#define match_condassert 0x01 /* Called to check a condition assertion */
-#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
-
-/* Non-error returns from the match() function. Error returns are externally
-defined PCRE_ERROR_xxx codes, which are all negative. */
-
-#define MATCH_MATCH 1
-#define MATCH_NOMATCH 0
-
-/* Special internal returns from the match() function. Make them sufficiently
-negative to avoid the external error codes. */
-
-#define MATCH_COMMIT (-999)
-#define MATCH_PRUNE (-998)
-#define MATCH_SKIP (-997)
-#define MATCH_THEN (-996)
-
-/* Maximum number of ints of offset to save on the stack for recursive calls.
-If the offset vector is bigger, malloc is used. This should be a multiple of 3,
-because the offset vector is always a multiple of 3 long. */
-
-#define REC_STACK_SAVE_MAX 30
-
-/* Min and max values for the common repeats; for the maxima, 0 => infinity */
-
-static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
-
-
-
-#ifdef DEBUG
-/*************************************************
-* Debugging function to print chars *
-*************************************************/
-
-/* Print a sequence of chars in printable format, stopping at the end of the
-subject if the requested.
-
-Arguments:
- p points to characters
- length number to print
- is_subject TRUE if printing from within md->start_subject
- md pointer to matching data block, if is_subject is TRUE
-
-Returns: nothing
-*/
-
-static void
-pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
-{
-unsigned int c;
-if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
-while (length-- > 0)
- if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
-}
-#endif
-
-
-
-/*************************************************
-* Match a back-reference *
-*************************************************/
-
-/* If a back reference hasn't been set, the length that is passed is greater
-than the number of characters left in the string, so the match fails.
-
-Arguments:
- offset index into the offset vector
- eptr points into the subject
- length length to be matched
- md points to match data block
- ims the ims flags
-
-Returns: TRUE if matched
-*/
-
-static BOOL
-match_ref(int offset, register USPTR eptr, int length, match_data *md,
- unsigned long int ims)
-{
-USPTR p = md->start_subject + md->offset_vector[offset];
-
-#ifdef DEBUG
-if (eptr >= md->end_subject)
- printf("matching subject <null>");
-else
- {
- printf("matching subject ");
- pchars(eptr, length, TRUE, md);
- }
-printf(" against backref ");
-pchars(p, length, FALSE, md);
-printf("\n");
-#endif
-
-/* Always fail if not enough characters left */
-
-if (length > md->end_subject - eptr) return FALSE;
-
-/* Separate the caselesss case for speed */
-
-if ((ims & PCRE_CASELESS) != 0)
- {
- while (length-- > 0)
- if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
- }
-else
- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
-
-return TRUE;
-}
-
-
-
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
-
-The match() function is highly recursive, though not every recursive call
-increases the recursive depth. Nevertheless, some regular expressions can cause
-it to recurse to a great depth. I was writing for Unix, so I just let it call
-itself recursively. This uses the stack for saving everything that has to be
-saved for a recursive call. On Unix, the stack can be large, and this works
-fine.
-
-It turns out that on some non-Unix-like systems there are problems with
-programs that use a lot of stack. (This despite the fact that every last chip
-has oodles of memory these days, and techniques for extending the stack have
-been known for decades.) So....
-
-There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
-calls by keeping local variables that need to be preserved in blocks of memory
-obtained from malloc() instead instead of on the stack. Macros are used to
-achieve this so that the actual code doesn't look very different to what it
-always used to.
-
-The original heap-recursive code used longjmp(). However, it seems that this
-can be very slow on some operating systems. Following a suggestion from Stan
-Switzer, the use of longjmp() has been abolished, at the cost of having to
-provide a unique number for each call to RMATCH. There is no way of generating
-a sequence of numbers at compile time in C. I have given them names, to make
-them stand out more clearly.
-
-Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
-FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
-tests. Furthermore, not using longjmp() means that local dynamic variables
-don't have indeterminate values; this has meant that the frame size can be
-reduced because the result can be "passed back" by straight setting of the
-variable instead of being passed in the frame.
-****************************************************************************
-***************************************************************************/
-
-/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
-below must be updated in sync. */
-
-enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
- RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
- RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
- RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
- RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54 };
-
-/* These versions of the macros use the stack, as normal. There are debugging
-versions and production versions. Note that the "rw" argument of RMATCH isn't
-actuall used in this definition. */
-
-#ifndef NO_RECURSE
-#define REGISTER register
-
-#ifdef DEBUG
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- { \
- printf("match() called in line %d\n", __LINE__); \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
- printf("to line %d\n", __LINE__); \
- }
-#define RRETURN(ra) \
- { \
- printf("match() returned %d from line %d ", ra, __LINE__); \
- return ra; \
- }
-#else
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
-#define RRETURN(ra) return ra
-#endif
-
-#else
-
-
-/* These versions of the macros manage a private stack on the heap. Note that
-the "rd" argument of RMATCH isn't actually used in this definition. It's the md
-argument of match(), which never changes. */
-
-#define REGISTER
-
-#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
- {\
- heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
- frame->Xwhere = rw; \
- newframe->Xeptr = ra;\
- newframe->Xecode = rb;\
- newframe->Xmstart = mstart;\
- newframe->Xoffset_top = rc;\
- newframe->Xims = re;\
- newframe->Xeptrb = rf;\
- newframe->Xflags = rg;\
- newframe->Xrdepth = frame->Xrdepth + 1;\
- newframe->Xprevframe = frame;\
- frame = newframe;\
- DPRINTF(("restarting from line %d\n", __LINE__));\
- goto HEAP_RECURSE;\
- L_##rw:\
- DPRINTF(("jumped back to line %d\n", __LINE__));\
- }
-
-#define RRETURN(ra)\
- {\
- heapframe *newframe = frame;\
- frame = newframe->Xprevframe;\
- (pcre_stack_free)(newframe);\
- if (frame != NULL)\
- {\
- rrc = ra;\
- goto HEAP_RETURN;\
- }\
- return ra;\
- }
-
-
-/* Structure for remembering the local variables in a private frame */
-
-typedef struct heapframe {
- struct heapframe *Xprevframe;
-
- /* Function arguments that may change */
-
- const uschar *Xeptr;
- const uschar *Xecode;
- const uschar *Xmstart;
- int Xoffset_top;
- long int Xims;
- eptrblock *Xeptrb;
- int Xflags;
- unsigned int Xrdepth;
-
- /* Function local variables */
-
- const uschar *Xcallpat;
- const uschar *Xcharptr;
- const uschar *Xdata;
- const uschar *Xnext;
- const uschar *Xpp;
- const uschar *Xprev;
- const uschar *Xsaved_eptr;
-
- recursion_info Xnew_recursive;
-
- BOOL Xcur_is_word;
- BOOL Xcondition;
- BOOL Xprev_is_word;
-
- unsigned long int Xoriginal_ims;
-
-#ifdef SUPPORT_UCP
- int Xprop_type;
- int Xprop_value;
- int Xprop_fail_result;
- int Xprop_category;
- int Xprop_chartype;
- int Xprop_script;
- int Xoclength;
- uschar Xocchars[8];
-#endif
-
- int Xctype;
- unsigned int Xfc;
- int Xfi;
- int Xlength;
- int Xmax;
- int Xmin;
- int Xnumber;
- int Xoffset;
- int Xop;
- int Xsave_capture_last;
- int Xsave_offset1, Xsave_offset2, Xsave_offset3;
- int Xstacksave[REC_STACK_SAVE_MAX];
-
- eptrblock Xnewptrb;
-
- /* Where to jump back to */
-
- int Xwhere;
-
-} heapframe;
-
-#endif
-
-
-/***************************************************************************
-***************************************************************************/
-
-
-
-/*************************************************
-* Match from current position *
-*************************************************/
-
-/* This function is called recursively in many circumstances. Whenever it
-returns a negative (error) response, the outer incarnation must also return the
-same response.
-
-Performance note: It might be tempting to extract commonly used fields from the
-md structure (e.g. utf8, end_subject) into individual variables to improve
-performance. Tests using gcc on a SPARC disproved this; in the first case, it
-made performance worse.
-
-Arguments:
- eptr pointer to current character in subject
- ecode pointer to current position in compiled code
- mstart pointer to the current match start position (can be modified
- by encountering \K)
- offset_top current top pointer
- md pointer to "static" info for the match
- ims current /i, /m, and /s options
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- flags can contain
- match_condassert - this is an assertion condition
- match_cbegroup - this is the start of an unlimited repeat
- group that can match an empty string
- rdepth the recursion depth
-
-Returns: MATCH_MATCH if matched ) these values are >= 0
- MATCH_NOMATCH if failed to match )
- a negative PCRE_ERROR_xxx value if aborted by an error condition
- (e.g. stopped by repeated call or recursion limit)
-*/
-
-static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
- int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
- int flags, unsigned int rdepth)
-{
-/* These variables do not need to be preserved over recursion in this function,
-so they can be ordinary variables in all cases. Mark some of them with
-"register" because they are used a lot in loops. */
-
-register int rrc; /* Returns from recursive calls */
-register int i; /* Used for loops not involving calls to RMATCH() */
-register unsigned int c; /* Character values not kept over RMATCH() calls */
-register BOOL utf8; /* Local copy of UTF-8 flag for speed */
-
-BOOL minimize, possessive; /* Quantifier options */
-
-/* When recursion is not being used, all "local" variables that have to be
-preserved over calls to RMATCH() are part of a "frame" which is obtained from
-heap storage. Set up the top-level frame here; others are obtained from the
-heap whenever RMATCH() does a "recursion". See the macro definitions above. */
-
-#ifdef NO_RECURSE
-heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
-frame->Xprevframe = NULL; /* Marks the top level */
-
-/* Copy in the original argument variables */
-
-frame->Xeptr = eptr;
-frame->Xecode = ecode;
-frame->Xmstart = mstart;
-frame->Xoffset_top = offset_top;
-frame->Xims = ims;
-frame->Xeptrb = eptrb;
-frame->Xflags = flags;
-frame->Xrdepth = rdepth;
-
-/* This is where control jumps back to to effect "recursion" */
-
-HEAP_RECURSE:
-
-/* Macros make the argument variables come from the current frame */
-
-#define eptr frame->Xeptr
-#define ecode frame->Xecode
-#define mstart frame->Xmstart
-#define offset_top frame->Xoffset_top
-#define ims frame->Xims
-#define eptrb frame->Xeptrb
-#define flags frame->Xflags
-#define rdepth frame->Xrdepth
-
-/* Ditto for the local variables */
-
-#ifdef SUPPORT_UTF8
-#define charptr frame->Xcharptr
-#endif
-#define callpat frame->Xcallpat
-#define data frame->Xdata
-#define next frame->Xnext
-#define pp frame->Xpp
-#define prev frame->Xprev
-#define saved_eptr frame->Xsaved_eptr
-
-#define new_recursive frame->Xnew_recursive
-
-#define cur_is_word frame->Xcur_is_word
-#define condition frame->Xcondition
-#define prev_is_word frame->Xprev_is_word
-
-#define original_ims frame->Xoriginal_ims
-
-#ifdef SUPPORT_UCP
-#define prop_type frame->Xprop_type
-#define prop_value frame->Xprop_value
-#define prop_fail_result frame->Xprop_fail_result
-#define prop_category frame->Xprop_category
-#define prop_chartype frame->Xprop_chartype
-#define prop_script frame->Xprop_script
-#define oclength frame->Xoclength
-#define occhars frame->Xocchars
-#endif
-
-#define ctype frame->Xctype
-#define fc frame->Xfc
-#define fi frame->Xfi
-#define length frame->Xlength
-#define max frame->Xmax
-#define min frame->Xmin
-#define number frame->Xnumber
-#define offset frame->Xoffset
-#define op frame->Xop
-#define save_capture_last frame->Xsave_capture_last
-#define save_offset1 frame->Xsave_offset1
-#define save_offset2 frame->Xsave_offset2
-#define save_offset3 frame->Xsave_offset3
-#define stacksave frame->Xstacksave
-
-#define newptrb frame->Xnewptrb
-
-/* When recursion is being used, local variables are allocated on the stack and
-get preserved during recursion in the normal way. In this environment, fi and
-i, and fc and c, can be the same variables. */
-
-#else /* NO_RECURSE not defined */
-#define fi i
-#define fc c
-
-
-#ifdef SUPPORT_UTF8 /* Many of these variables are used only */
-const uschar *charptr; /* in small blocks of the code. My normal */
-#endif /* style of coding would have declared */
-const uschar *callpat; /* them within each of those blocks. */
-const uschar *data; /* However, in order to accommodate the */
-const uschar *next; /* version of this code that uses an */
-USPTR pp; /* external "stack" implemented on the */
-const uschar *prev; /* heap, it is easier to declare them all */
-USPTR saved_eptr; /* here, so the declarations can be cut */
- /* out in a block. The only declarations */
-recursion_info new_recursive; /* within blocks below are for variables */
- /* that do not have to be preserved over */
-BOOL cur_is_word; /* a recursive call to RMATCH(). */
-BOOL condition;
-BOOL prev_is_word;
-
-unsigned long int original_ims;
-
-#ifdef SUPPORT_UCP
-int prop_type;
-int prop_value;
-int prop_fail_result;
-int prop_category;
-int prop_chartype;
-int prop_script;
-int oclength;
-uschar occhars[8];
-#endif
-
-int ctype;
-int length;
-int max;
-int min;
-int number;
-int offset;
-int op;
-int save_capture_last;
-int save_offset1, save_offset2, save_offset3;
-int stacksave[REC_STACK_SAVE_MAX];
-
-eptrblock newptrb;
-#endif /* NO_RECURSE */
-
-/* These statements are here to stop the compiler complaining about unitialized
-variables. */
-
-#ifdef SUPPORT_UCP
-prop_value = 0;
-prop_fail_result = 0;
-#endif
-
-
-/* This label is used for tail recursion, which is used in a few cases even
-when NO_RECURSE is not defined, in order to reduce the amount of stack that is
-used. Thanks to Ian Taylor for noticing this possibility and sending the
-original patch. */
-
-TAIL_RECURSE:
-
-/* OK, now we can get on with the real code of the function. Recursive calls
-are specified by the macro RMATCH and RRETURN is used to return. When
-NO_RECURSE is *not* defined, these just turn into a recursive call to match()
-and a "return", respectively (possibly with some debugging if DEBUG is
-defined). However, RMATCH isn't like a function call because it's quite a
-complicated macro. It has to be used in one particular way. This shouldn't,
-however, impact performance when true recursion is being used. */
-
-#ifdef SUPPORT_UTF8
-utf8 = md->utf8; /* Local copy of the flag */
-#else
-utf8 = FALSE;
-#endif
-
-/* First check that we haven't called match() too many times, or that we
-haven't exceeded the recursive call limit. */
-
-if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
-if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
-
-original_ims = ims; /* Save for resetting on ')' */
-
-/* At the start of a group with an unlimited repeat that may match an empty
-string, the match_cbegroup flag is set. When this is the case, add the current
-subject pointer to the chain of such remembered pointers, to be checked when we
-hit the closing ket, in order to break infinite loops that match no characters.
-When match() is called in other circumstances, don't add to the chain. The
-match_cbegroup flag must NOT be used with tail recursion, because the memory
-block that is used is on the stack, so a new one may be required for each
-match(). */
-
-if ((flags & match_cbegroup) != 0)
- {
- newptrb.epb_saved_eptr = eptr;
- newptrb.epb_prev = eptrb;
- eptrb = &newptrb;
- }
-
-/* Now start processing the opcodes. */
-
-for (;;)
- {
- minimize = possessive = FALSE;
- op = *ecode;
-
- /* For partial matching, remember if we ever hit the end of the subject after
- matching at least one subject character. */
-
- if (md->partial &&
- eptr >= md->end_subject &&
- eptr > mstart)
- md->hitend = TRUE;
-
- switch(op)
- {
- case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
-
- case OP_PRUNE:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_PRUNE);
-
- case OP_COMMIT:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
-
- case OP_SKIP:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- md->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
-
- case OP_THEN:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM54);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_THEN);
-
- /* Handle a capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector.
- We mustn't change the current values of the data slot, because they may be
- set from a previous iteration of this group, and be referred to by a
- reference inside the group.
-
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration
- of the same bracket.
-
- If there isn't enough space in the offset vector, treat this as if it were
- a non-capturing bracket. Don't worry about setting the flag for the error
- case here; that is handled in the code for KET. */
-
- case OP_CBRA:
- case OP_SCBRA:
- number = GET2(ecode, 1+LINK_SIZE);
- offset = number << 1;
-
-#ifdef DEBUG
- printf("start bracket %d\n", number);
- printf("subject=");
- pchars(eptr, 16, TRUE, md);
- printf("\n");
-#endif
-
- if (offset < md->offset_max)
- {
- save_offset1 = md->offset_vector[offset];
- save_offset2 = md->offset_vector[offset+1];
- save_offset3 = md->offset_vector[md->offset_end - number];
- save_capture_last = md->capture_last;
-
- DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
-
- flags = (op == OP_SCBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM1);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- md->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
-
- DPRINTF(("bracket %d failed\n", number));
-
- md->offset_vector[offset] = save_offset1;
- md->offset_vector[offset+1] = save_offset2;
- md->offset_vector[md->offset_end - number] = save_offset3;
-
- RRETURN(MATCH_NOMATCH);
- }
-
- /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
- as a non-capturing bracket. */
-
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
-
- DPRINTF(("insufficient capture room: treat as non-capturing\n"));
-
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
-
- /* Non-capturing bracket. Loop for all the alternatives. When we get to the
- final alternative within the brackets, we would return the result of a
- recursive call to match() whatever happened. We can reduce stack usage by
- turning this into a tail recursion, except in the case when match_cbegroup
- is set.*/
-
- case OP_BRA:
- case OP_SBRA:
- DPRINTF(("start non-capturing bracket\n"));
- flags = (op >= OP_SBRA)? match_cbegroup : 0;
- for (;;)
- {
- if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
- {
- if (flags == 0) /* Not a possibly empty group */
- {
- ecode += _pcre_OP_lengths[*ecode];
- DPRINTF(("bracket 0 tail recursion\n"));
- goto TAIL_RECURSE;
- }
-
- /* Possibly empty group; can't use tail recursion. */
-
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM48);
- RRETURN(rrc);
- }
-
- /* For non-final alternatives, continue the loop for a NOMATCH result;
- otherwise return. */
-
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM2);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- /* Control never reaches here. */
-
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. As there is only one branch to be
- obeyed, we can use tail recursion to avoid using another stack frame. */
-
- case OP_COND:
- case OP_SCOND:
- if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
- {
- offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
- condition = md->recursive != NULL &&
- (offset == RREF_ANY || offset == md->recursive->group_num);
- ecode += condition? 3 : GET(ecode, 1);
- }
-
- else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
- {
- offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
- condition = offset < offset_top && md->offset_vector[offset] >= 0;
- ecode += condition? 3 : GET(ecode, 1);
- }
-
- else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
-
- /* The condition is an assertion. Call match() to evaluate it - setting
- the final argument match_condassert causes it to stop at the end of an
- assertion. */
-
- else
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_condassert, RM3);
- if (rrc == MATCH_MATCH)
- {
- condition = TRUE;
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
- while (*ecode == OP_ALT) ecode += GET(ecode, 1);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- RRETURN(rrc); /* Need braces because of following else */
- }
- else
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
- }
-
- /* We are now at the branch that is to be obeyed. As there is only one,
- we can use tail recursion to avoid using another stack frame, except when
- match_cbegroup is required for an unlimited repeat of a possibly empty
- group. If the second alternative doesn't exist, we can just plough on. */
-
- if (condition || *ecode == OP_ALT)
- {
- ecode += 1 + LINK_SIZE;
- if (op == OP_SCOND) /* Possibly empty group */
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
- RRETURN(rrc);
- }
- else /* Group must match something */
- {
- flags = 0;
- goto TAIL_RECURSE;
- }
- }
- else /* Condition false & no 2nd alternative */
- {
- ecode += 1 + LINK_SIZE;
- }
- break;
-
-
- /* End of the pattern, either real or forced. If we are in a top-level
- recursion, we should restore the offsets appropriately and continue from
- after the call. */
-
- case OP_ACCEPT:
- case OP_END:
- if (md->recursive != NULL && md->recursive->group_num == 0)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("End of pattern in a (?0) recursion\n"));
- md->recursive = rec->prevrec;
- memmove(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- mstart = rec->save_start;
- ims = original_ims;
- ecode = rec->after_call;
- break;
- }
-
- /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
- string - backtracking will then try other alternatives, if any. */
-
- if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
- md->end_match_ptr = eptr; /* Record where we ended */
- md->end_offset_top = offset_top; /* and how many extracts were taken */
- md->start_match_ptr = mstart; /* and the start (\K can modify) */
- RRETURN(MATCH_MATCH);
-
- /* Change option settings */
-
- case OP_OPT:
- ims = ecode[1];
- ecode += 2;
- DPRINTF(("ims set to %02lx\n", ims));
- break;
-
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
-
- case OP_ASSERT:
- case OP_ASSERTBACK:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM4);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
-
- /* If checking an assertion for a condition, return MATCH_MATCH. */
-
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
-
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
-
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- ecode += 1 + LINK_SIZE;
- offset_top = md->end_offset_top;
- continue;
-
- /* Negative assertion: all branches must fail to match */
-
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM5);
- if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
-
- ecode += 1 + LINK_SIZE;
- continue;
-
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
-
- case OP_REVERSE:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- i = GET(ecode, 1);
- while (i-- > 0)
- {
- eptr--;
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- BACKCHAR(eptr);
- }
- }
- else
-#endif
-
- /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
-
- {
- eptr -= GET(ecode, 1);
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- }
-
- /* Skip to next op code */
-
- ecode += 1 + LINK_SIZE;
- break;
-
- /* The callout item calls an external function, if one is provided, passing
- details of the match so far. This is mainly for debugging, though the
- function is able to force a failure. */
-
- case OP_CALLOUT:
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = ecode[1];
- cb.offset_vector = md->offset_vector;
- cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
- cb.pattern_position = GET(ecode, 2);
- cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
- cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
- }
- ecode += 2 + 2*LINK_SIZE;
- break;
-
- /* Recursion either matches the current regex, or some subexpression. The
- offset data is the offset to the starting bracket from the start of the
- whole pattern. (This is so that it works from duplicated subpatterns.)
-
- If there are any capturing brackets started but not finished, we have to
- save their starting points and reinstate them after the recursion. However,
- we don't know how many such there are (offset_top records the completed
- total) so we just have to save all the potential data. There may be up to
- 65535 such values, which is too large to put on the stack, but using malloc
- for small numbers seems expensive. As a compromise, the stack is used when
- there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
- is used. A problem is what to do if the malloc fails ... there is no way of
- returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
- values on the stack, and accept that the rest may be wrong.
-
- There are also other values that have to be saved. We use a chained
- sequence of blocks that actually live on the stack. Thanks to Robin Houston
- for the original version of this logic. */
-
- case OP_RECURSE:
- {
- callpat = md->start_code + GET(ecode, 1);
- new_recursive.group_num = (callpat == md->start_code)? 0 :
- GET2(callpat, 1 + LINK_SIZE);
-
- /* Add to "recursing stack" */
-
- new_recursive.prevrec = md->recursive;
- md->recursive = &new_recursive;
-
- /* Find where to continue from afterwards */
-
- ecode += 1 + LINK_SIZE;
- new_recursive.after_call = ecode;
-
- /* Now save the offset data. */
-
- new_recursive.saved_max = md->offset_end;
- if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
- new_recursive.offset_save = stacksave;
- else
- {
- new_recursive.offset_save =
- (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
- if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
- }
-
- memcpy(new_recursive.offset_save, md->offset_vector,
- new_recursive.saved_max * sizeof(int));
- new_recursive.save_start = mstart;
- mstart = eptr;
-
- /* OK, now we can do the recursion. For each top-level alternative we
- restore the offset and recursion data. */
-
- DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
- flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
- md, ims, eptrb, flags, RM6);
- if (rrc == MATCH_MATCH)
- {
- DPRINTF(("Recursion matched\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_MATCH);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- DPRINTF(("Recursion gave error %d\n", rrc));
- RRETURN(rrc);
- }
-
- md->recursive = &new_recursive;
- memcpy(md->offset_vector, new_recursive.offset_save,
- new_recursive.saved_max * sizeof(int));
- callpat += GET(callpat, 1);
- }
- while (*callpat == OP_ALT);
-
- DPRINTF(("Recursion didn't match\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never reaches here */
-
- /* "Once" brackets are like assertion brackets except that after a match,
- the point in the subject string is not moved back. Thus there can never be
- a move back into the brackets. Friedl calls these "atomic" subpatterns.
- Check the alternative branches in turn - the matching won't pass the KET
- for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer. */
-
- case OP_ONCE:
- prev = ecode;
- saved_eptr = eptr;
-
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
-
- /* If hit the end of the group (which could be repeated), fail */
-
- if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
-
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
-
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
-
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1+LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. The second "call" of match()
- uses tail recursion, to avoid using another stack frame. We need to reset
- any options that changed within the bracket before re-running it, so
- check the next opcode. */
-
- if (ecode[1+LINK_SIZE] == OP_OPT)
- {
- ims = (ims & ~PCRE_IMS) | ecode[4];
- DPRINTF(("ims set to %02lx at group repeat\n", ims));
- }
-
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode = prev;
- flags = 0;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
-
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
-
- case OP_ALT:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
-
- /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
- indicating that it may occur zero times. It may repeat infinitely, or not
- at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
- with fixed upper repeat limits are compiled as a number of copies, with the
- optional ones preceded by BRAZERO or BRAMINZERO. */
-
- case OP_BRAZERO:
- {
- next = ecode+1;
- RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1 + LINK_SIZE;
- }
- break;
-
- case OP_BRAMINZERO:
- {
- next = ecode+1;
- do next += GET(next, 1); while (*next == OP_ALT);
- RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode++;
- }
- break;
-
- case OP_SKIPZERO:
- {
- next = ecode+1;
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1 + LINK_SIZE;
- }
- break;
-
- /* End of a group, repeated or non-repeating. */
-
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- prev = ecode - GET(ecode, 1);
-
- /* If this was a group that remembered the subject start, in order to break
- infinite repeats of empty string matches, retrieve the subject start from
- the chain. Otherwise, set it NULL. */
-
- if (*prev >= OP_SBRA)
- {
- saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
- eptrb = eptrb->epb_prev; /* Backup to previous group */
- }
- else saved_eptr = NULL;
-
- /* If we are at the end of an assertion group, stop matching and return
- MATCH_MATCH, but record the current high water mark for use by positive
- assertions. Do this also for the "once" (atomic) groups. */
-
- if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
- *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
- *prev == OP_ONCE)
- {
- md->end_match_ptr = eptr; /* For ONCE */
- md->end_offset_top = offset_top;
- RRETURN(MATCH_MATCH);
- }
-
- /* For capturing groups we have to check the group number back at the start
- and if necessary complete handling an extraction by setting the offsets and
- bumping the high water mark. Note that whole-pattern recursion is coded as
- a recurse into group 0, so it won't be picked up here. Instead, we catch it
- when the OP_END is reached. Other recursion is handled here. */
-
- if (*prev == OP_CBRA || *prev == OP_SCBRA)
- {
- number = GET2(prev, 1+LINK_SIZE);
- offset = number << 1;
-
-#ifdef DEBUG
- printf("end bracket %d", number);
- printf("\n");
-#endif
-
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
-
- /* Handle a recursively called group. Restore the offsets
- appropriately and continue from after the call. */
-
- if (md->recursive != NULL && md->recursive->group_num == number)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
- md->recursive = rec->prevrec;
- mstart = rec->save_start;
- memcpy(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- ecode = rec->after_call;
- ims = original_ims;
- break;
- }
- }
-
- /* For both capturing and non-capturing groups, reset the value of the ims
- flags, in case they got changed during the group. */
-
- ims = original_ims;
- DPRINTF(("ims reset to %02lx\n", ims));
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1 + LINK_SIZE;
- break;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. In the second case, we can use
- tail recursion to avoid using another stack frame, unless we have an
- unlimited repeat of a group that can match an empty string. */
-
- flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
-
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (flags != 0) /* Could match an empty string */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
- RRETURN(rrc);
- }
- ecode = prev;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
-
- /* Start of subject unless notbol, or after internal newline if multiline */
-
- case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr != md->start_subject &&
- (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- /* ... else fall through */
-
- /* Start of subject assertion */
-
- case OP_SOD:
- if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Start of match assertion */
-
- case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Reset the start of match point */
-
- case OP_SET_SOM:
- mstart = eptr;
- ecode++;
- break;
-
- /* Assert before internal newline if multiline, or before a terminating
- newline unless endonly is set, else end of subject unless noteol is set. */
-
- case OP_DOLL:
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr < md->end_subject)
- { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
- else
- { if (md->noteol) RRETURN(MATCH_NOMATCH); }
- ecode++;
- break;
- }
- else
- {
- if (md->noteol) RRETURN(MATCH_NOMATCH);
- if (!md->endonly)
- {
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- }
- /* ... else fall through for endonly */
-
- /* End of subject assertion (\z) */
-
- case OP_EOD:
- if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* End of subject or ending \n assertion (\Z) */
-
- case OP_EODN:
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- /* Word boundary assertions */
-
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
-
- /* Find out if the previous and current characters are "word" characters.
- It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
- be "non-word" characters. */
-
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- if (eptr == md->start_subject) prev_is_word = FALSE; else
- {
- const uschar *lastptr = eptr - 1;
- while((*lastptr & 0xc0) == 0x80) lastptr--;
- GETCHAR(c, lastptr);
- prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- if (eptr >= md->end_subject) cur_is_word = FALSE; else
- {
- GETCHAR(c, eptr);
- cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- }
- else
-#endif
-
- /* More streamlined when not in UTF-8 mode */
-
- {
- prev_is_word = (eptr != md->start_subject) &&
- ((md->ctypes[eptr[-1]] & ctype_word) != 0);
- cur_is_word = (eptr < md->end_subject) &&
- ((md->ctypes[*eptr] & ctype_word) != 0);
- }
-
- /* Now see if the situation is what we want */
-
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- /* Match a single character type; inline for speed */
-
- case OP_ANY:
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- /* Fall through */
-
- case OP_ALLANY:
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- ecode++;
- break;
-
- /* Match a single byte, even in UTF-8 mode. This opcode really does match
- any byte, even newline, independent of the setting of PCRE_DOTALL. */
-
- case OP_ANYBYTE:
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_digit) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_digit) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_space) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_space) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_NOT_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c < 256 &&
-#endif
- (md->ctypes[c] & ctype_word) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
-#endif
- (md->ctypes[c] & ctype_word) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
-
- case OP_ANYNL:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- ecode++;
- break;
-
- case OP_NOT_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
- case OP_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- ecode++;
- break;
-
- case OP_NOT_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
-
- case OP_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- ecode++;
- break;
-
-#ifdef SUPPORT_UCP
- /* Check the next character by Unicode property. We will get here only
- if the support is in the binary; otherwise a compile-time error occurs. */
-
- case OP_PROP:
- case OP_NOTPROP:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
-
- switch(ecode[1])
- {
- case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_LAMP:
- if ((chartype == ucp_Lu ||
- chartype == ucp_Ll ||
- chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_GC:
- if ((ecode[2] != category) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_PC:
- if ((ecode[2] != chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- case PT_SC:
- if ((ecode[2] != script) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- ecode += 3;
- }
- break;
-
- /* Match an extended Unicode sequence. We will get here only if the support
- is in the binary; otherwise a compile-time error occurs. */
-
- case OP_EXTUNI:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
- if (category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- category = _pcre_ucp_findprop(c, &chartype, &script);
- if (category != ucp_M) break;
- eptr += len;
- }
- }
- ecode++;
- break;
-#endif
-
-
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- case OP_REF:
- {
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3;
-
- /* If the reference is unset, there are two possibilities:
-
- (a) In the default, Perl-compatible state, set the length to be longer
- than the amount of subject left; this ensures that every attempt at a
- match fails. We can't just fail here, because of the possibility of
- quantifiers with zero minima.
-
- (b) If the JavaScript compatibility flag is set, set the length to zero
- so that the back reference matches an empty string.
-
- Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
-
- if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
- else
- length = md->offset_vector[offset+1] - md->offset_vector[offset];
-
- /* Set up for repetition, or handle the non-repeated case */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- continue; /* With the main loop */
- }
-
- /* If the length of the reference is zero, just continue with the
- main loop. */
-
- if (length == 0) continue;
-
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
-
- for (i = 1; i <= min; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
-
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
-
- if (min == max) continue;
-
- /* If minimizing, keep trying and advancing the pointer */
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || !match_ref(offset, eptr, length, md, ims))
- RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest string and work backwards */
-
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) break;
- eptr += length;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
-
-
- /* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255,
- and either the matching is caseful, or the characters are in the range
- 0-127 when UTF-8 processing is enabled. The only difference between
- OP_CLASS and OP_NCLASS occurs when a data character outside the range is
- encountered.
-
- First, look past the end of the item to see if there is repeat information
- following. Then obey similar code to character type repeats - written out
- again for speed. */
-
- case OP_NCLASS:
- case OP_CLASS:
- {
- data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
-
- /* If max == min we can continue with the main loop without the
- need to recurse. */
-
- if (min == max) continue;
-
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest possible run, then work backwards. */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c > 255)
- {
- if (op == OP_CLASS) break;
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- }
- eptr += len;
- }
- for (;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- eptr++;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
-
- /* Match an extended character class. This opcode is encountered only
- in UTF-8 mode, because that's the only time it is compiled. */
-
-#ifdef SUPPORT_UTF8
- case OP_XCLASS:
- {
- data = ecode + 1 + LINK_SIZE; /* Save for matching */
- ecode += GET(ecode, 1); /* Advance past the item */
-
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
-
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
-
- /* If max == min we can continue with the main loop without the
- need to recurse. */
-
- if (min == max) continue;
-
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* If maximizing, find the longest possible run, then work backwards. */
-
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (!_pcre_xclass(c, data)) break;
- eptr += len;
- }
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- RRETURN(MATCH_NOMATCH);
- }
-
- /* Control never gets here */
- }
-#endif /* End of XCLASS */
-
- /* Match a single character, casefully */
-
- case OP_CHAR:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
- }
- else
-#endif
-
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
-
- /* Match a single character, caselessly */
-
- case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
-
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
-
- /* If the pattern character's value is < 128, we have only one byte, and
- can use the fast lookup table. */
-
- if (fc < 128)
- {
- if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
-
- /* Otherwise we must pick up the subject character */
-
- else
- {
- unsigned int dc;
- GETCHARINC(dc, eptr);
- ecode += length;
-
- /* If we have Unicode property support, we can use it to test the other
- case of the character, if there is one. */
-
- if (fc != dc)
- {
-#ifdef SUPPORT_UCP
- if (dc != _pcre_ucp_othercase(fc))
-#endif
- RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
-
- /* Match a single character repeatedly. */
-
- case OP_EXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_POSUPTO:
- possessive = TRUE;
- /* Fall through */
-
- case OP_UPTO:
- case OP_MINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_MINUPTO;
- ecode += 3;
- goto REPEATCHAR;
-
- case OP_POSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
-
- case OP_POSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATCHAR;
-
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- c = *ecode++ - OP_STAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
-
- REPEATCHAR:
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- charptr = ecode;
- GETCHARLEN(fc, ecode, length);
- if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- ecode += length;
-
- /* Handle multibyte character matching specially here. There is
- support for caseless matching if UCP support is present. */
-
- if (length > 1)
- {
-#ifdef SUPPORT_UCP
- unsigned int othercase;
- if ((ims & PCRE_CASELESS) != 0 &&
- (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
- oclength = _pcre_ord2utf8(othercase, occhars);
- else oclength = 0;
-#endif /* SUPPORT_UCP */
-
- for (i = 1; i <= min; i++)
- {
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else { RRETURN(MATCH_NOMATCH); }
-#endif /* SUPPORT_UCP */
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else { RRETURN (MATCH_NOMATCH); }
-#endif /* SUPPORT_UCP */
- }
- /* Control never gets here */
- }
-
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr > md->end_subject - length) break;
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
-#ifdef SUPPORT_UCP
- else if (oclength == 0) break;
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) break;
- eptr += oclength;
- }
-#else /* without SUPPORT_UCP */
- else break;
-#endif /* SUPPORT_UCP */
- }
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) RRETURN(MATCH_NOMATCH);
-#ifdef SUPPORT_UCP
- eptr--;
- BACKCHAR(eptr);
-#else /* without SUPPORT_UCP */
- eptr -= length;
-#endif /* SUPPORT_UCP */
- }
- }
- /* Control never gets here */
- }
-
- /* If the length of a UTF-8 character is 1, we fall through here, and
- obey the code as for non-UTF-8 characters below, though in this case the
- value of fc will always be < 128. */
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* When not in UTF-8 mode, load a single-byte character. */
- {
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
- }
-
- /* The value of fc at this point is always less than 256, though we may or
- may not be in UTF-8 mode. The code is duplicated for the caseless and
- caseful cases, for speed, since matching characters is likely to be quite
- common. First, ensure the minimum number of matches are present. If min =
- max, continue at the same level without recursing. Otherwise, if
- minimizing, keep trying the rest of the expression and advancing one
- matching character if failing, up to the maximum. Alternatively, if
- maximizing, find the maximum number of characters and work backwards. */
-
- DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
- for (i = 1; i <= min; i++)
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- fc != md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons (includes all multi-byte characters) */
-
- else
- {
- for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
- /* Match a negated single one-byte character. The character we are
- checking can be multibyte. */
-
- case OP_NOT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- GETCHARINCTEST(c, eptr);
- if ((ims & PCRE_CASELESS) != 0)
- {
-#ifdef SUPPORT_UTF8
- if (c < 256)
-#endif
- c = md->lcc[c];
- if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
- }
- break;
-
- /* Match a negated single one-byte character repeatedly. This is almost a
- repeat of the code for a repeated single character, but I haven't found a
- nice way of commoning these up that doesn't require a test of the
- positive/negative option for each character match. Maybe that wouldn't add
- very much to the time taken, but character matching *is* what this is all
- about... */
-
- case OP_NOTEXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_NOTMINUPTO;
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATNOTCHAR;
-
- case OP_NOTPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
-
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- c = *ecode++ - OP_NOTSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single-byte matches. We can give up quickly
- if there are fewer than the minimum number of bytes left in the
- subject. */
-
- REPEATNOTCHAR:
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
-
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
-
- DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
-
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
-
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (d < 256) d = md->lcc[d];
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
-
- /* Caseful comparisons */
-
- else
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
- }
-
- if (min == max) continue;
-
- if (minimize)
- {
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- GETCHARINC(d, eptr);
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
-
- /* Maximize case */
-
- else
- {
- pp = eptr;
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
-
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
-
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
-
- case OP_TYPEEXACT:
- min = max = GET2(ecode, 1);
- minimize = TRUE;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPEPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATTYPE;
-
- case OP_TYPEPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATTYPE;
-
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
-
- /* Common code for all repeated single character type matches. Note that
- in UTF-8 mode, '.' matches a character of any length, but for the other
- character types, the valid characters are all one-byte long. */
-
- REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
-
-#ifdef SUPPORT_UCP
- if (ctype == OP_PROP || ctype == OP_NOTPROP)
- {
- prop_fail_result = ctype == OP_NOTPROP;
- prop_type = *ecode++;
- prop_value = *ecode++;
- }
- else prop_type = -1;
-#endif
-
- /* First, ensure the minimum number of matches are present. Use inline
- code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Also we can test that there are at least
- the minimum number of bytes before we start. This isn't as effective in
- UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
- is tidier. Also separate the UCP code, which can be the same for both UTF-8
- and single-bytes. */
-
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- if (min > 0)
- {
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- }
- break;
-
- case PT_LAMP:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_GC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_PC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case PT_SC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (i = 1; i <= min; i++)
- {
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-/* Handle all other cases when the coding is UTF-8 */
-
-#ifdef SUPPORT_UTF8
- if (utf8) switch(ctype)
- {
- case OP_ANY:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
-
- case OP_ALLANY:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
-
- case OP_ANYBYTE:
- eptr += min;
- break;
-
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
-
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
-
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- } /* End switch(ctype) */
-
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Code for the non-UTF-8 case for minimum matching of operators other
- than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
- number of bytes present, as this was tested above. */
-
- switch(ctype)
- {
- case OP_ANY:
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
- break;
-
- case OP_ALLANY:
- eptr += min;
- break;
-
- case OP_ANYBYTE:
- eptr += min;
- break;
-
- /* Because of the CRLF case, we can't assume the minimum number of
- bytes are present in this case. */
-
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
-
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* If min = max, continue at the same level without recursing */
-
- if (min == max) continue;
-
- /* If minimizing, we have to test the rest of the pattern before each
- subsequent match. Again, separate the UTF-8 case for speed, and also
- separate the UCP cases. */
-
- if (minimize)
- {
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_LAMP:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_GC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_PC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- case PT_SC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
-
- GETCHARINC(c, eptr);
- switch(ctype)
- {
- case OP_ANY: /* This is the non-NL case */
- case OP_ALLANY:
- case OP_ANYBYTE:
- break;
-
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
-
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- break;
-
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- break;
-
- case OP_NOT_DIGIT:
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- else
-#endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
-
- c = *eptr++;
- switch(ctype)
- {
- case OP_ANY: /* This is the non-NL case */
- case OP_ALLANY:
- case OP_ANYBYTE:
- break;
-
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
-
- case 0x000a:
- break;
-
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
-
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- break;
-
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- break;
-
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- break;
-
- case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
- break;
-
- case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- /* Control never gets here */
- }
-
- /* If maximizing, it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). Again, keep the
- UTF-8 and UCP stuff separate. */
-
- else
- {
- pp = eptr; /* Remember where we started */
-
-#ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (prop_fail_result) break;
- eptr+= len;
- }
- break;
-
- case PT_LAMP:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_GC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_PC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
-
- case PT_SC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- }
-
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
-
- else if (ctype == OP_EXTUNI)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) break;
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- for (;;) /* Move back over one extended */
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- BACKCHAR(eptr);
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr--;
- }
- }
- }
-
- else
-#endif /* SUPPORT_UCP */
-
-#ifdef SUPPORT_UTF8
- /* UTF-8 mode */
-
- if (utf8)
- {
- switch(ctype)
- {
- case OP_ANY:
- if (max < INT_MAX)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
-
- /* Handle unlimited UTF-8 repeat */
-
- else
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- break;
-
- case OP_ALLANY:
- if (max < INT_MAX)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
- break;
-
- /* The byte case is the same as non-UTF8 */
-
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
-
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c &&
- c != 0x0085 && c != 0x2028 && c != 0x2029)))
- break;
- eptr += len;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_HSPACE)) break;
- eptr += len;
- }
- break;
-
- case OP_NOT_VSPACE:
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_VSPACE)) break;
- eptr += len;
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
- eptr+= len;
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
- eptr+= len;
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
- eptr+= len;
- }
- break;
-
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
- eptr+= len;
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
-#endif /* SUPPORT_UTF8 */
-
- /* Not UTF-8 mode */
- {
- switch(ctype)
- {
- case OP_ANY:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
-
- case OP_ALLANY:
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
-
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c && c != 0x0085)))
- break;
- eptr++;
- }
- }
- break;
-
- case OP_NOT_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x09 || c == 0x20 || c == 0xa0) break;
- eptr++;
- }
- break;
-
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x09 && c != 0x20 && c != 0xa0) break;
- eptr++;
- }
- break;
-
- case OP_NOT_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
- break;
- eptr++;
- }
- break;
-
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
- break;
- eptr++;
- }
- break;
-
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
- break;
- eptr++;
- }
- break;
-
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
- break;
- eptr++;
- }
- break;
-
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
-
- /* eptr is now past the end of the maximum run */
-
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- }
-
- /* Get here if we can't make it match with any permitted repetitions */
-
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
-
- /* There's been some horrible disaster. Arrival here can only mean there is
- something seriously wrong in the code above or the OP_xxx definitions. */
-
- default:
- DPRINTF(("Unknown opcode %d\n", *ecode));
- RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
- }
-
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
-
- } /* End of main loop */
-/* Control never reaches here */
-
-
-/* When compiling to use the heap rather than the stack for recursive calls to
-match(), the RRETURN() macro jumps here. The number that is saved in
-frame->Xwhere indicates which label we actually want to return to. */
-
-#ifdef NO_RECURSE
-#define LBL(val) case val: goto L_RM##val;
-HEAP_RETURN:
-switch (frame->Xwhere)
- {
- LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
- LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
- LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
- LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54)
-#ifdef SUPPORT_UTF8
- LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
- LBL(32) LBL(34) LBL(42) LBL(46)
-#ifdef SUPPORT_UCP
- LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
-#endif /* SUPPORT_UCP */
-#endif /* SUPPORT_UTF8 */
- default:
- DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
- return PCRE_ERROR_INTERNAL;
- }
-#undef LBL
-#endif /* NO_RECURSE */
-}
-
-
-/***************************************************************************
-****************************************************************************
- RECURSION IN THE match() FUNCTION
-
-Undefine all the macros that were defined above to handle this. */
-
-#ifdef NO_RECURSE
-#undef eptr
-#undef ecode
-#undef mstart
-#undef offset_top
-#undef ims
-#undef eptrb
-#undef flags
-
-#undef callpat
-#undef charptr
-#undef data
-#undef next
-#undef pp
-#undef prev
-#undef saved_eptr
-
-#undef new_recursive
-
-#undef cur_is_word
-#undef condition
-#undef prev_is_word
-
-#undef original_ims
-
-#undef ctype
-#undef length
-#undef max
-#undef min
-#undef number
-#undef offset
-#undef op
-#undef save_capture_last
-#undef save_offset1
-#undef save_offset2
-#undef save_offset3
-#undef stacksave
-
-#undef newptrb
-
-#endif
-
-/* These two are defined as macros in both cases */
-
-#undef fc
-#undef fi
-
-/***************************************************************************
-***************************************************************************/
-
-
-
-/*************************************************
-* Execute a Regular Expression *
-*************************************************/
-
-/* This function applies a compiled re to a subject string and picks out
-portions of the string if it matches. Two elements in the vector are set for
-each substring: the offsets to the start and end of the substring.
-
-Arguments:
- argument_re points to the compiled expression
- extra_data points to extra data or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets points to a vector of ints to be filled in with offsets
- offsetcount the number of elements in the vector
-
-Returns: > 0 => success; value is the number of elements filled in
- = 0 => success, but offsets is not big enough
- -1 => failed to match
- < -1 => some kind of unexpected problem
-*/
-
-PCRE_EXP_DEFN int
-pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
- PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
- int offsetcount)
-{
-int rc, resetcount, ocount;
-int first_byte = -1;
-int req_byte = -1;
-int req_byte2 = -1;
-int newline;
-unsigned long int ims;
-BOOL using_temporary_offsets = FALSE;
-BOOL anchored;
-BOOL startline;
-BOOL firstline;
-BOOL first_byte_caseless = FALSE;
-BOOL req_byte_caseless = FALSE;
-BOOL utf8;
-match_data match_block;
-match_data *md = &match_block;
-const uschar *tables;
-const uschar *start_bits = NULL;
-USPTR start_match = (USPTR)subject + start_offset;
-USPTR end_subject;
-USPTR req_byte_ptr = start_match - 1;
-
-pcre_study_data internal_study;
-const pcre_study_data *study;
-
-real_pcre internal_re;
-const real_pcre *external_re = (const real_pcre *)argument_re;
-const real_pcre *re = external_re;
-
-/* Plausibility checks */
-
-if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-if (re == NULL || subject == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
-if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
-
-/* Fish out the optional data from the extra_data structure, first setting
-the default values. */
-
-study = NULL;
-md->match_limit = MATCH_LIMIT;
-md->match_limit_recursion = MATCH_LIMIT_RECURSION;
-md->callout_data = NULL;
-
-/* The table pointer is always in native byte order. */
-
-tables = external_re->tables;
-
-if (extra_data != NULL)
- {
- register unsigned int flags = extra_data->flags;
- if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
- study = (const pcre_study_data *)extra_data->study_data;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
- md->match_limit = extra_data->match_limit;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
- md->match_limit_recursion = extra_data->match_limit_recursion;
- if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
- md->callout_data = extra_data->callout_data;
- if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
- }
-
-/* If the exec call supplied NULL for tables, use the inbuilt ones. This
-is a feature that makes it possible to save compiled regex and re-use them
-in other programs later. */
-
-if (tables == NULL) tables = _pcre_default_tables;
-
-/* Check that the first field in the block is the magic number. If it is not,
-test for a regex that was compiled on a host of opposite endianness. If this is
-the case, flipped values are put in internal_re and internal_study if there was
-study data too. */
-
-if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
-
-/* Set up other data */
-
-anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
-startline = (re->flags & PCRE_STARTLINE) != 0;
-firstline = (re->options & PCRE_FIRSTLINE) != 0;
-
-/* The code starts after the real_pcre block and the capture name table. */
-
-md->start_code = (const uschar *)external_re + re->name_table_offset +
- re->name_count * re->name_entry_size;
-
-md->start_subject = (USPTR)subject;
-md->start_offset = start_offset;
-md->end_subject = md->start_subject + length;
-end_subject = md->end_subject;
-
-md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
-utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
-md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
-
-md->notbol = (options & PCRE_NOTBOL) != 0;
-md->noteol = (options & PCRE_NOTEOL) != 0;
-md->notempty = (options & PCRE_NOTEMPTY) != 0;
-md->partial = (options & PCRE_PARTIAL) != 0;
-md->hitend = FALSE;
-
-md->recursive = NULL; /* No recursion at top level */
-
-md->lcc = tables + lcc_offset;
-md->ctypes = tables + ctypes_offset;
-
-/* Handle different \R options. */
-
-switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
- {
- case 0:
- if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
- md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
- else
-#ifdef BSR_ANYCRLF
- md->bsr_anycrlf = TRUE;
-#else
- md->bsr_anycrlf = FALSE;
-#endif
- break;
-
- case PCRE_BSR_ANYCRLF:
- md->bsr_anycrlf = TRUE;
- break;
-
- case PCRE_BSR_UNICODE:
- md->bsr_anycrlf = FALSE;
- break;
-
- default: return PCRE_ERROR_BADNEWLINE;
- }
-
-/* Handle different types of newline. The three bits give eight cases. If
-nothing is set at run time, whatever was used at compile time applies. */
-
-switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
- (pcre_uint32)options) & PCRE_NEWLINE_BITS)
- {
- case 0: newline = NEWLINE; break; /* Compile-time default */
- case PCRE_NEWLINE_CR: newline = '\r'; break;
- case PCRE_NEWLINE_LF: newline = '\n'; break;
- case PCRE_NEWLINE_CR+
- PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
- case PCRE_NEWLINE_ANY: newline = -1; break;
- case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
- default: return PCRE_ERROR_BADNEWLINE;
- }
-
-if (newline == -2)
- {
- md->nltype = NLTYPE_ANYCRLF;
- }
-else if (newline < 0)
- {
- md->nltype = NLTYPE_ANY;
- }
-else
- {
- md->nltype = NLTYPE_FIXED;
- if (newline > 255)
- {
- md->nllen = 2;
- md->nl[0] = (newline >> 8) & 255;
- md->nl[1] = newline & 255;
- }
- else
- {
- md->nllen = 1;
- md->nl[0] = newline;
- }
- }
-
-/* Partial matching is supported only for a restricted set of regexes at the
-moment. */
-
-if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
- return PCRE_ERROR_BADPARTIAL;
-
-/* Check a UTF-8 string if required. Unfortunately there's no way of passing
-back the character offset. */
-
-#ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
- {
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
- if (start_offset > 0 && start_offset < length)
- {
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
- }
- }
-#endif
-
-/* The ims options can vary during the matching as a result of the presence
-of (?ims) items in the pattern. They are kept in a local variable so that
-restoring at the exit of a group is easy. */
-
-ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
-
-/* If the expression has got more back references than the offsets supplied can
-hold, we get a temporary chunk of working store to use during the matching.
-Otherwise, we can use the vector supplied, rounding down its size to a multiple
-of 3. */
-
-ocount = offsetcount - (offsetcount % 3);
-
-if (re->top_backref > 0 && re->top_backref >= ocount/3)
- {
- ocount = re->top_backref * 3 + 3;
- md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
- if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
- DPRINTF(("Got memory to hold back references\n"));
- }
-else md->offset_vector = offsets;
-
-md->offset_end = ocount;
-md->offset_max = (2*ocount)/3;
-md->offset_overflow = FALSE;
-md->capture_last = -1;
-
-/* Compute the minimum number of offsets that we need to reset each time. Doing
-this makes a huge difference to execution time when there aren't many brackets
-in the pattern. */
-
-resetcount = 2 + re->top_bracket * 2;
-if (resetcount > offsetcount) resetcount = ocount;
-
-/* Reset the working variable associated with each extraction. These should
-never be used unless previously set, but they get saved and restored, and so we
-initialize them to avoid reading uninitialized locations. */
-
-if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
- while (--iptr >= iend) *iptr = -1;
- }
-
-/* Set up the first character to match, if available. The first_byte value is
-never set for an anchored regular expression, but the anchoring may be forced
-at run time, so we have to test for anchoring. The first char may be unset for
-an unanchored pattern, of course. If there's no first char and the pattern was
-studied, there may be a bitmap of possible first characters. */
-
-if (!anchored)
- {
- if ((re->flags & PCRE_FIRSTSET) != 0)
- {
- first_byte = re->first_byte & 255;
- if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
- first_byte = md->lcc[first_byte];
- }
- else
- if (!startline && study != NULL &&
- (study->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = study->start_bits;
- }
-
-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
-
-if ((re->flags & PCRE_REQCHSET) != 0)
- {
- req_byte = re->req_byte & 255;
- req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
- }
-
-
-/* ==========================================================================*/
-
-/* Loop for handling unanchored repeated matching attempts; for anchored regexs
-the loop runs just once. */
-
-for(;;)
- {
- USPTR save_end_subject = end_subject;
- USPTR new_start_match;
-
- /* Reset the maximum number of extractions we might see. */
-
- if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector;
- register int *iend = iptr + resetcount;
- while (iptr < iend) *iptr++ = -1;
- }
-
- /* Advance to a unique first char if possible. If firstline is TRUE, the
- start of the match is constrained to the first line of a multiline string.
- That is, the match must be before or at the first newline. Implement this by
- temporarily adjusting end_subject so that we stop scanning at a newline. If
- the match fails at the newline, later code breaks this loop. */
-
- if (firstline)
- {
- USPTR t = start_match;
- while (t < md->end_subject && !IS_NEWLINE(t)) t++;
- end_subject = t;
- }
-
- /* Now test for a unique first byte */
-
- if (first_byte >= 0)
- {
- if (first_byte_caseless)
- while (start_match < end_subject &&
- md->lcc[*start_match] != first_byte)
- { NEXTCHAR(start_match); }
- else
- while (start_match < end_subject && *start_match != first_byte)
- { NEXTCHAR(start_match); }
- }
-
- /* Or to just after a linebreak for a multiline match if possible */
-
- else if (startline)
- {
- if (start_match > md->start_subject + start_offset)
- {
- while (start_match <= end_subject && !WAS_NEWLINE(start_match))
- { NEXTCHAR(start_match); }
-
- /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
- and we are now at a LF, advance the match position by one more character.
- */
-
- if (start_match[-1] == '\r' &&
- (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
- start_match < end_subject &&
- *start_match == '\n')
- start_match++;
- }
- }
-
- /* Or to a non-unique first char after study */
-
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
- {
- register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- { NEXTCHAR(start_match); }
- else break;
- }
- }
-
- /* Restore fudged end_subject */
-
- end_subject = save_end_subject;
-
-#ifdef DEBUG /* Sigh. Some compilers never learn. */
- printf(">>>> Match against: ");
- pchars(start_match, end_subject - start_match, TRUE, md);
- printf("\n");
-#endif
-
- /* If req_byte is set, we know that that character must appear in the subject
- for the match to succeed. If the first character is set, req_byte must be
- later in the subject; otherwise the test starts at the match point. This
- optimization can save a huge amount of backtracking in patterns with nested
- unlimited repeats that aren't going to match. Writing separate code for
- cased/caseless versions makes it go faster, as does using an autoincrement
- and backing off on a match.
-
- HOWEVER: when the subject string is very, very long, searching to its end can
- take a long time, and give bad performance on quite ordinary patterns. This
- showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
- string... so we don't do this when the string is sufficiently long.
-
- ALSO: this processing is disabled when partial matching is requested.
- */
-
- if (req_byte >= 0 &&
- end_subject - start_match < REQ_BYTE_MAX &&
- !md->partial)
- {
- register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
-
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
-
- if (p > req_byte_ptr)
- {
- if (req_byte_caseless)
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_byte || pp == req_byte2) { p--; break; }
- }
- }
- else
- {
- while (p < end_subject)
- {
- if (*p++ == req_byte) { p--; break; }
- }
- }
-
- /* If we can't find the required character, break the matching loop,
- forcing a match failure. */
-
- if (p >= end_subject)
- {
- rc = MATCH_NOMATCH;
- break;
- }
-
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
-
- req_byte_ptr = p;
- }
- }
-
- /* OK, we can now run the match. */
-
- md->start_match_ptr = start_match;
- md->match_call_count = 0;
- rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
-
- switch(rc)
- {
- /* NOMATCH and PRUNE advance by one character. THEN at this level acts
- exactly like PRUNE. */
-
- case MATCH_NOMATCH:
- case MATCH_PRUNE:
- case MATCH_THEN:
- new_start_match = start_match + 1;
-#ifdef SUPPORT_UTF8
- if (utf8)
- while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
- new_start_match++;
-#endif
- break;
-
- /* SKIP passes back the next starting point explicitly. */
-
- case MATCH_SKIP:
- new_start_match = md->start_match_ptr;
- break;
-
- /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
-
- case MATCH_COMMIT:
- rc = MATCH_NOMATCH;
- goto ENDLOOP;
-
- /* Any other return is some kind of error. */
-
- default:
- goto ENDLOOP;
- }
-
- /* Control reaches here for the various types of "no match at this point"
- result. Reset the code to MATCH_NOMATCH for subsequent checking. */
-
- rc = MATCH_NOMATCH;
-
- /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
- newline in the subject (though it may continue over the newline). Therefore,
- if we have just failed to match, starting at a newline, do not continue. */
-
- if (firstline && IS_NEWLINE(start_match)) break;
-
- /* Advance to new matching position */
-
- start_match = new_start_match;
-
- /* Break the loop if the pattern is anchored or if we have passed the end of
- the subject. */
-
- if (anchored || start_match > end_subject) break;
-
- /* If we have just passed a CR and we are now at a LF, and the pattern does
- not contain any explicit matches for \r or \n, and the newline option is CRLF
- or ANY or ANYCRLF, advance the match position by one more character. */
-
- if (start_match[-1] == '\r' &&
- start_match < end_subject &&
- *start_match == '\n' &&
- (re->flags & PCRE_HASCRORLF) == 0 &&
- (md->nltype == NLTYPE_ANY ||
- md->nltype == NLTYPE_ANYCRLF ||
- md->nllen == 2))
- start_match++;
-
- } /* End of for(;;) "bumpalong" loop */
-
-/* ==========================================================================*/
-
-/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
-conditions is true:
-
-(1) The pattern is anchored or the match was failed by (*COMMIT);
-
-(2) We are past the end of the subject;
-
-(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
- this option requests that a match occur at or before the first newline in
- the subject.
-
-When we have a match and the offset vector is big enough to deal with any
-backreferences, captured substring offsets will already be set up. In the case
-where we had to get some local store to hold offsets for backreference
-processing, copy those that we can. In this case there need not be overflow if
-certain parts of the pattern were not used, even though there are more
-capturing parentheses than vector slots. */
-
-ENDLOOP:
-
-if (rc == MATCH_MATCH)
- {
- if (using_temporary_offsets)
- {
- if (offsetcount >= 4)
- {
- memcpy(offsets + 2, md->offset_vector + 2,
- (offsetcount - 2) * sizeof(int));
- DPRINTF(("Copied offsets from temporary memory\n"));
- }
- if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
-
- /* Set the return code to the number of captured strings, or 0 if there are
- too many to fit into the vector. */
-
- rc = md->offset_overflow? 0 : md->end_offset_top/2;
-
- /* If there is space, set up the whole thing as substring 0. The value of
- md->start_match_ptr might be modified if \K was encountered on the success
- matching path. */
-
- if (offsetcount < 2) rc = 0; else
- {
- offsets[0] = md->start_match_ptr - md->start_subject;
- offsets[1] = md->end_match_ptr - md->start_subject;
- }
-
- DPRINTF((">>>> returning %d\n", rc));
- return rc;
- }
-
-/* Control gets here if there has been an error, or if the overall match
-attempt has failed at all permitted starting positions. */
-
-if (using_temporary_offsets)
- {
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
-
-if (rc != MATCH_NOMATCH)
- {
- DPRINTF((">>>> error: returning %d\n", rc));
- return rc;
- }
-else if (md->partial && md->hitend)
- {
- DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
- return PCRE_ERROR_PARTIAL;
- }
-else
- {
- DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
- return PCRE_ERROR_NOMATCH;
- }
-}
-
-/* End of pcre_exec.c */