2 * IRC - Internet Relay Chat, common/match.c
3 * Copyright (C) 1990 Jarkko Oikarinen
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 1, or (at your option)
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 * Written by Run (carlo@runaway.xs4all.nl), 25-10-96
36 * From: Carlo Wood <carlo@runaway.xs4all.nl>
37 * Message-Id: <199609021026.MAA02393@runaway.xs4all.nl>
38 * Subject: [C-Com] Analysis for `mmatch' (was: gline4 problem)
39 * To: coder-com@mail.undernet.org (coder committee)
40 * Date: Mon, 2 Sep 1996 12:26:01 +0200 (MET DST)
42 * We need a new function `mmatch(const char *old_mask, const char *new_mask)'
43 * which returns `true' likewise the current `match' (start with copying it),
44 * but which treats '*' and '?' in `new_mask' differently (not "\*" and "\?" !)
45 * as follows: a '*' in `new_mask' does not match a '?' in `old_mask' and
46 * a '?' in `new_mask' does not match a '\?' in `old_mask'.
47 * And ofcourse... a '*' in `new_mask' does not match a '\*' in `old_mask'...
48 * And last but not least, '\?' and '\*' in `new_mask' now become one character.
51 int mmatch(const char *old_mask, const char *new_mask)
53 register const char *m = old_mask;
54 register const char *n = new_mask;
75 for (m--; (m > old_mask) && (*m == '?'); m--)
77 if ((*m == '*') && (m > old_mask) && (m[-1] != '\\'))
83 /* Added to `mmatch' : Because '\?' and '\*' now is one character: */
84 if ((*na == '\\') && ((na[1] == '*') || (na[1] == '?')))
95 if ((*m == '\\') && ((m[1] == '*') || (m[1] == '?')))
103 /* Added to `mmatch' : Because '\?' and '\*' now is one character: */
104 if ((*n == '\\') && ((n[1] == '*') || (n[1] == '?')))
113 * This `if' has been changed compared to match() to do the following:
115 * old (m) new (n) boolean expression
116 * * any (*m == '*' && !mq) ||
117 * ? any except '*' (*m == '?' && !mq && (*n != '*' || nq)) ||
118 * any except * or ? same as m (!((*m == '*' || *m == '?') && !mq) &&
119 * toLower(*m) == toLower(*n) &&
120 * !((mq && !nq) || (!mq && nq)))
122 * Here `any' also includes \* and \? !
124 * After reworking the boolean expressions, we get:
125 * (Optimized to use boolean shortcircuits, with most frequently occuring
126 * cases upfront (which took 2 hours!)).
128 if ((*m == '*' && !mq) ||
129 ((!mq || nq) && toLower(*m) == toLower(*n)) ||
130 (*m == '?' && !mq && (*n != '*' || nq)))
143 /* Added to `mmatch' : Because '\?' and '\*' now is one character: */
144 if ((*na == '\\') && ((na[1] == '*') || (na[1] == '?')))
153 * Compare if a given string (name) matches the given
154 * mask (which can contain wild cards: '*' - match any
155 * number of chars, '?' - match any single character.
164 * Rewritten by Andrea Cocito (Nemesi), November 1998.
168 /****************** Nemesi's match() ***************/
170 int match(const char *mask, const char *string)
172 register const char *m = mask, *s = string;
174 const char *bm, *bs; /* Will be reg anyway on a decent CPU/compiler */
176 /* Process the "head" of the mask, if any */
177 while ((ch = *m++) && (ch != '*'))
181 if (*m == '?' || *m == '*')
184 if (toLower(*s) != toLower(ch))
193 /* We got a star: quickly find if/where we match the next char */
195 bm = m; /* Next try rollback here */
204 continue; /* while */
206 if (*m == '?' || *m == '*')
209 goto break_while; /* C is structured ? */
213 return 0; /* mask ends with '*', we got it */
215 while (toLower(*s++) != ch)
218 bs = s; /* Next try start from here */
220 /* Check the rest of the "chunk" */
228 if (*m == '?' || *m == '*')
231 if (toLower(*s) != toLower(ch))
253 * Collapse a pattern string into minimal components.
254 * This particular version is "in place", so that it changes the pattern
255 * which is to be reduced to a "minimal" size.
257 * (C) Carlo Wood - 6 Oct 1998
258 * Speedup rewrite by Andrea Cocito, December 1998.
259 * Note that this new optimized alghoritm can *only* work in place.
262 char *collapse(char *mask)
264 register int star = 0;
265 register char *m = mask;
272 if ((*m == '*') && ((m[1] == '*') || (m[1] == '?')))
281 if (star && (*m != '?'))
287 if ((*m == '\\') && ((m[1] == '*') || (m[1] == '?')))
296 if ((*m == '\\') && ((m[1] == '*') || (m[1] == '?')))
306 ***************** Nemesi's matchcomp() / matchexec() **************
309 /* These functions allow the use of "compiled" masks, you compile a mask
310 * by means of matchcomp() that gets the plain text mask as input and writes
311 * its result in the memory locations addressed by the 3 parameters:
312 * - *cmask will contain the text of the compiled mask
313 * - *minlen will contain the lenght of the shortest string that can match
315 * - *charset will contain the minimal set of chars needed to match the mask
316 * You can pass NULL as *charset and it will be simply not returned, but you
317 * MUST pass valid pointers for *minlen and *cmask (wich must be big enough
318 * to contain the compiled mask text that is in the worst case as long as the
319 * text of the mask itself in plaintext format) and the return value of
320 * matchcomp() will be the number of chars actually written there (excluded
321 * the trailing zero). cmask can be == mask, matchcomp() can work in place.
322 * The {cmask, minlen} couple of values make the real compiled mask and
323 * need to be passed to the functions that use the compiled mask, if you pass
324 * the wrong minlen or something wrong in cmask to one of these expect a
325 * coredump. This means that when you record a compiled mask you must store
326 * *both* these values.
327 * Once compiled the mask can be used to match a string by means of
328 * matchexec(), it can be printed back to human-readable format by means
329 * of sprintmatch() or it can be compared to another compiled mask by means
330 * of mmexec() that will tell if it completely overrides that mask (a lot like
331 * what mmatch() does for plain text masks).
332 * You can gain a lot of speed in many situations avoiding to matchexec() when:
333 * - The maximum lenght of the field you are about to match() the mask to is
334 * shorter than minlen, in example when matching abc*def*ghil with a nick:
335 * It just cannot match since a nick is at most 9 chars long and the mask
336 * needs at least 10 chars (10 will be the value returned in minlen).
337 * - The charset allowed for the field you are about to match to doesn't
338 * "contain" the charset returned by matchcomp(), in example when you
339 * have *.* as mask it makes no sense to try to match it against a nick
340 * because, again, a nick can't contain a '.', you can check this with
341 * a simple (charset & NTL_IRCNK) in this case.
342 * - As a special case, since compiled masks are forced to lowercase,
343 * it would make no sense to use the NTL_LOWER and NTL_UPPER on a compiled
344 * mask, thus they are reused as follows: if the NTL_LOWER bit of charset
345 * is set it means that the mask contains only non-wilds chars (i.e. you can
346 * use strCasecmp() to match it or a direct hash lookup), if the NTL_UPPER
347 * bit is set it means that it contains only wild chars (and you can
348 * match it with strlen(field)>=minlen).
349 * Do these optimizations ONLY when the data you are about to pass to
350 * matchexec() are *known* to be invalid in advance, using strChattr()
351 * or strlen() on the text would be slower than calling matchexec() directly
353 * Internally a compiled mask contain in the *cmask area the text of
354 * the plain text form of the mask itself with applied the following hacks:
355 * - All characters are forced to lowercase (so that uppercase letters and
356 * specifically the symbols 'A' and 'Z' are reserved for special use)
357 * - All non-escaped stars '*' are replaced by the letter 'Z'
358 * - All non-escaped question marks '?' are replaced by the letter 'A'
359 * - All escape characters are removed, the wilds escaped by them are
360 * then passed by without the escape since they don't collide anymore
361 * with the real wilds (encoded as A/Z)
362 * - Finally the part of the mask that follows the last asterisk is
363 * reversed (byte order mirroring) and moved right after the first
365 * After all this a mask like: Head*CHUNK1*chu\*nK2*ch??k3*TaIl
366 * .... becomes: headZliatZchunk1Zchu*nk2ZchAAk3
367 * This can still be printed on a console, more or less understood by an
368 * human and handled with the usual str*() library functions.
369 * When you store somewhere the compiled mask you can avoid storing the
370 * textform of it since it can be "decompiled" by means of sprintmatch(),
371 * but at that time the following things are changed in the mask:
372 * - All chars have been forced to lowercase.
373 * - The mask is collapsed.
374 * The balance point of using compiled masks in terms of CPU is when you expect
375 * to use matchexec() instead of match() at least 20 times on the same mask
376 * or when you expect to use mmexec() instead of mmatch() 3 times.
382 * Compiles a mask into a form suitable for using in matchexec().
385 int matchcomp(char *cmask, int *minlen, int *charset, const char *mask)
387 const char *m = mask;
392 int l1, l2, lmin, loop, sign;
397 int chset2 = (NTL_LOWER | NTL_UPPER);
409 chset2 &= ~NTL_LOWER;
412 if ((*m == '?') || (*m == '*'))
420 chset2 &= ~NTL_LOWER;
424 chset &= NTL_char_attrib[((*b++ = toLower(ch))) - CHAR_MIN];
425 chset2 &= ~NTL_UPPER;
429 *charset = (chset | chset2);
440 for (x1 = ls + 1, x2 = (b - 1); x1 < x2; x1++, x2--)
449 while ((lmin = (l1 < l2) ? l1 : l2))
452 for (loop = 0; loop < lmin; loop++)
460 l1 -= (sign < 0) ? 0 : lmin;
461 l2 -= (sign > 0) ? 0 : lmin;
474 * Executes a match with a mask previosuly compiled with matchcomp()
475 * Note 1: If the mask isn't correctly produced by matchcomp() I will core
476 * Note 2: 'min' MUST be the value returned by matchcomp on that mask,
477 * or.... I will core even faster :-)
478 * Note 3: This piece of code is not intended to be nice but efficient.
481 int matchexec(const char *string, const char *cmask, int minlen)
483 register const char *s = string - 1;
484 register const char *b = cmask - 1;
486 register const char *bb, *bs;
490 while ((toLower(*++s) == *++b) && *s);
492 return ((*b != '\000') && ((*b++ != 'Z') || (*b != '\000')));
503 if ((trash = (s - string - minlen)) < 0)
507 while ((toLower(*--s) == *++b) && *b && (toLower(*--s) == *++b) && *b
508 && (toLower(*--s) == *++b) && *b && (toLower(*--s) == *++b) && *b);
513 return (*b != '\000');
521 while ((toLower(*++s) != ch))
527 while ((toLower(*++s) == *++b) && *b);
550 * Prints the human readable version of *cmask into *mask, (decompiles
552 * The area pointed by *mask MUST be big enough (the mask might be up to
553 * twice the size of its compiled form if it's made all of \? or \*, and
554 * this function can NOT work in place since it might enflate the mask)
555 * The printed mask is not identical to the one that was compiled to cmask,
556 * infact it is 1) forced to all lowercase, 2) collapsed, both things
557 * are supposed to NOT change it's meaning.
558 * It returns the number of chars actually written to *mask;
561 int matchdecomp(char *mask, const char *cmask)
563 register char *rtb = mask;
564 register const char *rcm = cmask;
565 register const char *begtail, *endtail;
573 for (; (*rcm != 'Z'); rcm++, rtb++)
575 if ((*rcm == '?') || (*rcm == '*'))
577 if (!((*rtb = ((*rcm == 'A') ? '?' : *rcm))))
584 while (*rcm && (*rcm != 'Z'))
609 for (rcm = endtail; (--rcm) > begtail; *rtb++ = ((*rcm == 'A') ? '?' : *rcm))
610 if ((*rcm == '?') || (*rcm == '*'))
619 * Checks if a wider compiled mask (wcm/wminlen) completely overrides
620 * a more restrict one (rcm/rminlen), basically what mmatch() does for
621 * non-compiled masks, returns 0 if the override is true (like mmatch()).
622 * "the wider overrides the restrict" means that any string that matches
623 * the restrict one _will_ also match the wider one, always.
624 * In this we behave differently from mmatch() because in example we return
625 * true for " a?*cd overrides a*bcd " for wich the override happens for how
626 * we literally defined it, here mmatch() would have returned false.
627 * The original concepts and the base alghoritm are copied from mmatch()
628 * written by Run (Carlo Wood), this function is written by
629 * Nemesi (Andrea Cocito)
632 int mmexec(const char *wcm, int wminlen, const char *rcm, int rminlen)
634 register const char *w, *r, *br, *bw, *rx, *rz;
635 register int eat, trash;
637 /* First of all rm must have enough non-stars to 'contain' wm */
638 if ((trash = rminlen - wminlen) < 0)
644 /* Let's start the game, remember that '*' is mapped to 'Z', '?'
645 is mapped to 'A' and that head?*??*?chunk*???*tail becomes
646 headAAAAZliatAAAZchunk for compiled masks */
648 /* Match the head of wm with the head of rm */
649 for (; (*r) && (*r != 'Z') && ((*w == *r) || (*w == 'A')); r++, w++);
651 while (*w == 'A') /* Eat extra '?' before '*' in wm if got '*' in rm */
653 if (*w != 'Z') /* head1<any>.. can't match head2<any>.. */
654 return ((*w) || (*r)) ? 1 : 0; /* and head<nul> matches only head<nul> */
656 return 0; /* headZ<nul> matches head<anything> */
658 /* Does rm have any stars in it ? let's check */
659 for (rx = r; *r && (*r != 'Z'); r++);
662 /* rm has no stars and thus isn't a mask but it's just a flat
663 string: special handling occurs here, note that eat must be 0 here */
668 for (; r--, (*w) && ((*w == *r) || (*w == 'A')); w++);
669 if (*w != 'Z') /* headZliat1<any> fails on head<any>2tail */
670 return (*w) ? 1 : 0; /* but headZliat<nul> matches head<any>tail */
673 /* match the chunks */
675 { /* This loop can't break but only return */
677 for (bw = w++; (*w != *rx); rx++) /* Seek the 1st char of the chunk */
678 if (--trash < 0) /* See if we can trash one more char of rm */
679 return 1; /* If not we can only fail of course */
680 for (r = ++rx, w++; (*w) && ((*w == *r) || (*w == 'A')); r++, w++);
681 if (!*w) /* Did last loop match the rest of chunk ? */
682 return 0; /* ... Yes, end of wm, matched ! */
684 { /* ... No, hitted non-star */
685 w = bw; /* Rollback at beginning of chunk */
686 if (--trash < 0) /* Trashed the char where this try started */
687 return 1; /* if we can't trash more chars fail */
691 rx = r; /* Successfully matched a chunk, move rx */
692 }; /* and go on with the next one */
696 /* rm has at least one '*' and thus is a 'real' mask */
697 rz = r++; /* rx = unused of head, rz = beg-tail */
699 /* Match the tail of wm (if any) against the tail of rm */
702 for (; (*w) && (*r != 'Z') && ((*w == *r) || (*w == 'A')); w++, r++);
703 if (*r == 'Z') /* extra '?' before tail are fluff, just flush 'em */
706 if (*w != 'Z') /* We aren't matching a chunk, can't rollback */
710 /* Match the chunks of wm against what remains of the head of rm */
714 for (bw++; (rx < rz) && (*bw != *rx); rx++) /* Seek the first */
715 if (--trash < 0) /* waste some trash reserve */
717 if (!(rx < rz)) /* head finished */
719 for (bw++, (br = ++rx);
720 (br < rz) && (*bw) && ((*bw == *br) || (*bw == 'A')); br++, bw++);
721 if (!(br < rz)) /* Note that we didn't use any 'eat' char yet, if */
722 while (*bw == 'A') /* there were eat-en chars the head would be over */
723 bw++, eat++; /* Happens only at end of head, and eat is still 0 */
730 { /* If we failed because we got the end of head */
731 trash -= (br - rx); /* it makes no sense to rollback, just trash */
732 if (--trash < 0) /* all the rest of the head wich isn't long */
733 return 1; /* enough for this chunk and go out of this */
734 break; /* loop, then we try with the chunks of rm */
746 /* Match the unused chunks of wm against the chunks of rm */
748 for (; *r && (*r != 'Z'); r++);
755 while (eat && *r) /* the '?' we had eated make us skip as many chars */
756 if (*r++ != 'Z') /* here, but can't skip stars or trailing zero */
758 for (bw++; (*r) && (*bw != *r); r++)
759 if ((*r != 'Z') && (--trash < 0))
763 for ((br = ++r), bw++;
764 (*br) && (*br != 'Z') && ((*bw == *br) || (*bw == 'A')); br++, bw++);
773 if ((!*br) || (*r == 'Z'))
774 { /* If we hit the end of rm or a star in it */
775 trash -= (br - r); /* makes no sense to rollback within this */
776 if (trash < 0) /* same chunk of br, skip it all and then */
777 return 1; /* either rollback or break this loop if */
778 if (!*br) /* it was the end of rm */
793 /* match the remaining chunks of wm against what remains of the tail of rm */
794 r = rz - eat - 1; /* can't have <nul> or 'Z'within the tail, so just move r */
798 for (bw++; (*bw != *r); r--)
803 for ((br = --r), bw++;
804 (*bw) && (br >= rx) && ((*bw == *br) || (*bw == 'A')); br--, bw++);
820 return 1; /* Auch... something left out ? Fail */
825 * Compiles an IP mask into an in_mask structure
826 * The given <mask> can either be:
827 * - An usual irc type mask, containing * and or ?
828 * - An ip number plus a /bitnumber part, that will only consider
829 * the first "bitnumber" bits of the IP (bitnumber must be in 0-31 range)
830 * - An ip numer plus a /ip.bit.mask.values that will consider
831 * only the bits marked as 1 in the ip.bit.mask.values
832 * In the last two cases both the ip number and the bitmask can specify
833 * less than 4 bytes, the missing bytes then default to zero, note that
834 * this is *different* from the way inet_aton() does and that this does
835 * NOT happen for normal IPmasks (not containing '/')
836 * If the returned value is zero the produced in_mask might match some IP,
837 * if it's nonzero it will never match anything (and the imask struct is
838 * set so that always fails).
840 * The returned structure contains 3 fields whose meaning is the following:
841 * im.mask = The bits considered significative in the IP
842 * im.bits = What these bits should look like to have a match
843 * im.fall = If zero means that the above information used as
844 * ((IP & im.mask) == im.bits) is enough to tell if the compiled
845 * mask matches the given IP, nonzero means that it is needed,
846 * in case they did match, to call also the usual text match
847 * functions, because the mask wasn't "completely compiled"
849 * They should be used like:
850 * matchcompIP(&im, mask);
851 * if ( ((IP & im.mask)!=im.bits)) || (im.fall&&match(mask,inet_ntoa(IP))) )
852 * { handle_non_match } else { handle_match };
854 * if ( match(mask, inet_ntoa(IP)) )
855 * { handle_non_match } else { handle_match };
857 * Note: This function could be smarter when dealing with complex masks,
858 * this implementation is quite lazy and understands only very simple
859 * cases, whatever contains a ? anywhere or contains a '*' that isn't
860 * part of a trailing '.*' will fallback to text-match, this could be
861 * avoided for masks like 12?3.5.6 12.*.3.4 1.*.*.2 72?72?72?72 and
862 * so on that "could" be completely compiled to IP masks.
863 * If you try to improve this be aware of the fact that ? and *
864 * could match both dots and digits and we _must_ always reject
865 * what doesn't match in textform (like leading zeros and so on),
866 * so it's a LOT more tricky than it might seem. By now most common
867 * cases are optimized.
870 int matchcompIP(struct in_mask *imask, const char *mask)
872 register const char *m = mask;
873 register unsigned int bits = 0;
874 register unsigned int filt = 0;
875 register int unco = 0;
876 register int digits = 0;
877 register int shift = 24;
878 register int tmp = 0;
885 if ((m[1] == '\\') || (m[1] == '*') || (m[1] == '?')
899 if (digits && !tmp) /* Leading zeros */
903 tmp += (*m - '0'); /* Can't overflow, INT_MAX > 2559 */
909 /* Intentional fallthrough */
911 if ((!shift) != (!*m))
913 /* Intentional fallthrough */
915 bits |= (tmp << shift);
927 if (digits && !tmp) /* Leading zeros */
931 tmp += (*m - '0'); /* Can't overflow, INT_MAX > 2559 */
941 if ((!shift) && (*m))
943 filt |= (tmp << shift);
957 if (filt && (!(shift < 16)) && (!(filt & 0xE0FFFFFF)))
958 filt = 0xFFFFFFFF << (32 - ((filt >> 24)));
963 /* Intentional fallthrough */
967 filt = (0xFFFFFFFF << (shift)) << 8;
1000 /* If we get here there is some error and this can't ever match */
1004 break; /* This time break the loop :) */
1009 imask->bits.s_addr = htonl(bits);
1010 imask->mask.s_addr = htonl(filt);
1012 return ((bits & ~filt) ? -1 : 0);