diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index d10b2a50ec..83260d7e03 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -680,7 +680,7 @@ impl CharClass { self.canonicalize() } - /// Canonicalze any sequence of ranges. + /// Canonicalize any sequence of ranges. /// /// This is responsible for enforcing the canonical format invariants /// as described on the docs for the `CharClass` type. @@ -703,6 +703,41 @@ impl CharClass { ordered } + /// Calculate the intersection of two canonical character classes. + /// + /// The returned intersection is canonical. + fn intersection(&self, other: &CharClass) -> CharClass { + if self.ranges.is_empty() || other.ranges.is_empty() { + return CharClass::empty(); + } + + let mut intersection = CharClass::empty(); + + let mut iter_a = self.ranges.iter(); + let mut iter_b = other.ranges.iter(); + let mut a = iter_a.next().unwrap(); + let mut b = iter_b.next().unwrap(); + loop { + if let Some(i) = a.intersection(&b) { + intersection.ranges.push(i); + } + + // If the range with the smaller end didn't match this time, + // it won't ever match, so move on to the next one. + let (iter, item) = if a.end < b.end { + (&mut iter_a, &mut a) + } else { + (&mut iter_b, &mut b) + }; + match iter.next() { + Some(v) => *item = v, + None => break, // no more ranges to check, done + } + } + + intersection.canonicalize() + } + /// Negates the character class. /// /// For all `c` where `c` is a Unicode scalar value, `c` matches `self` @@ -801,6 +836,18 @@ impl ClassRange { max(self.start, other.start) <= inc_char(min(self.end, other.end)) } + /// Returns the intersection of the two ranges if they have common + /// characters, `None` otherwise. 
+ fn intersection(&self, other: &ClassRange) -> Option { + let start = max(self.start, other.start); + let end = min(self.end, other.end); + if start <= end { + Some(ClassRange::new(start, end)) + } else { + None + } + } + /// Creates a new range representing the union of `self` and `other. fn merge(self, other: ClassRange) -> ClassRange { ClassRange { @@ -1907,6 +1954,108 @@ mod tests { ])); } + #[test] + fn class_intersection_empty() { + let cls1 = class(&[]); + let cls2 = class(&[('a', 'a')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_single_equal() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('a', 'a')]); + assert_intersection(cls1, cls2, class(&[('a', 'a')])); + } + + #[test] + fn class_intersection_single_unequal() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('b', 'b')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_single_in_other() { + let cls1 = class(&[('a', 'a')]); + let cls2 = class(&[('a', 'c')]); + assert_intersection(cls1, cls2, class(&[('a', 'a')])); + } + + #[test] + fn class_intersection_range_in_other() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('a', 'c')]); + assert_intersection(cls1, cls2, class(&[('a', 'b')])); + } + + #[test] + fn class_intersection_range_intersection() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('b', 'c')]); + assert_intersection(cls1, cls2, class(&[('b', 'b')])); + } + + #[test] + fn class_intersection_only_adjacent() { + let cls1 = class(&[('a', 'b')]); + let cls2 = class(&[('c', 'd')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_range_subset() { + let cls1 = class(&[('b', 'c')]); + let cls2 = class(&[('a', 'd')]); + assert_intersection(cls1, cls2, class(&[('b', 'c')])); + } + + #[test] + fn class_intersection_many_ranges_in_one_big() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('a', 'h')]); + 
assert_intersection(cls1, cls2, class(&[ + ('a', 'b'), ('d', 'e'), ('g', 'h') + ])); + } + + #[test] + fn class_intersection_many_ranges_same() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_intersection(cls1, cls2, class(&[ + ('a', 'b'), ('d', 'e'), ('g', 'h') + ])); + } + + #[test] + fn class_intersection_multiple_non_intersecting() { + let cls1 = class(&[('a', 'b'), ('g', 'h')]); + let cls2 = class(&[('d', 'e'), ('k', 'l')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_non_intersecting_then_intersecting() { + let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = class(&[('h', 'h')]); + assert_intersection(cls1, cls2, class(&[('h', 'h')])); + } + + #[test] + fn class_intersection_adjacent_alternating() { + let cls1 = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + assert_intersection(cls1, cls2, class(&[])); + } + + #[test] + fn class_intersection_overlapping_alternating() { + let cls1 = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + assert_intersection(cls1, cls2, class(&[('b', 'f')])); + } + #[test] fn class_canon_overlap_many_case_fold() { let cls = class(&[ @@ -2056,4 +2205,10 @@ mod tests { let expr = e("(?-u)[-./]"); assert_eq!("(?-u:[-\\.-/])", expr.to_string()); } + + fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) { + // intersection operation should be commutative + assert_eq!(cls1.intersection(&cls2), expected); + assert_eq!(cls2.intersection(&cls1), expected); + } } diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index c0c131ea2e..62ab76fb4f 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -86,6 +86,22 @@ enum Build { }, } +/// A type for representing the elements of a bracket stack used for parsing +/// character 
classes. +/// +/// This is for parsing nested character classes without recursion. +#[derive(Debug)] +enum Bracket { + /// The opening of a character class (possibly negated) + LeftBracket { + negated: bool, + }, + /// A set of characters within a character class, e.g., `a-z` + Set(CharClass), + /// An intersection operator (`&&`) + Intersection, +} + // Primary expression parsing routines. impl Parser { pub fn parse(s: &str, flags: Flags) -> Result { @@ -538,18 +554,38 @@ impl Parser { // Parses a character class, e.g., `[^a-zA-Z0-9]+`. // + // If the Unicode flag is enabled, the class is returned as a `CharClass`, + // otherwise it is converted to a `ByteClass`. + // // Start: `[` // End: `+` fn parse_class(&mut self) -> Result { - self.bump(); - self.ignore_space(); - let negated = self.bump_if('^'); - self.ignore_space(); - let mut class = CharClass::empty(); - while self.bump_if('-') { - self.ignore_space(); - class.ranges.push(ClassRange::one('-')); - } + let class = try!(self.parse_class_as_chars()); + Ok(Build::Expr(if self.flags.unicode { + Expr::Class(class) + } else { + let byte_class = class.to_byte_class(); + + // If `class` was only non-empty due to multibyte characters, the + // corresponding byte class will now be empty. + // + // See https://github.com/rust-lang/regex/issues/303 + if byte_class.is_empty() { + // e.g., (?-u)[^\x00-\xFF] + return Err(self.err(ErrorKind::EmptyClass)); + } + + Expr::ClassBytes(byte_class) + })) + } + + // Parses a character class as a `CharClass`, e.g., `[^a-zA-Z0-9]+`. 
+ // + // Start: `[` + // End: `+` + fn parse_class_as_chars(&mut self) -> Result { + let mut bracket_stack = vec![]; + bracket_stack.extend(self.parse_open_bracket()); loop { self.ignore_space(); if self.eof() { @@ -557,50 +593,41 @@ impl Parser { return Err(self.err(ErrorKind::UnexpectedClassEof)); } match self.cur() { - // If no ranges have been added, then `]` is the first - // character (sans, perhaps, the `^` symbol), so it should - // be interpreted as a `]` instead of a closing class bracket. - ']' if class.len() > 0 => { self.bump(); break } - '[' => match self.maybe_parse_ascii() { - Some(class2) => class.ranges.extend(class2), - None => { - return Err(self.err( - ErrorKind::UnsupportedClassChar('['))); - } - }, - '\\' => match try!(self.parse_escape()) { - Build::Expr(Expr::Class(class2)) => { - class.ranges.extend(class2); - } - Build::Expr(Expr::ClassBytes(class2)) => { - for byte_range in class2 { - let s = byte_range.start as char; - let e = byte_range.end as char; - class.ranges.push(ClassRange::new(s, e)); - } - } - Build::Expr(Expr::Literal { chars, .. }) => { - try!(self.parse_class_range(&mut class, chars[0])); - } - Build::Expr(Expr::LiteralBytes { bytes, .. }) => { - let start = bytes[0] as char; - try!(self.parse_class_range(&mut class, start)); + '[' => { + if let Some(class) = self.maybe_parse_ascii() { + // e.g. `[:alnum:]` + bracket_stack.push(Bracket::Set(class)); + } else { + // nested set, e.g. `[c-d]` in `[a-b[c-d]]` + bracket_stack.extend(self.parse_open_bracket()); } - Build::Expr(e) => { - let err = ErrorKind::InvalidClassEscape(e); - return Err(self.err(err)); + } + ']' => { + self.bump(); + let class = try!(self.close_bracket(&mut bracket_stack)); + if bracket_stack.is_empty() { + // That was the outermost class, so stop now + return Ok(class); } - // Because `parse_escape` can never return `LeftParen`. 
- _ => unreachable!(), - }, + bracket_stack.push(Bracket::Set(class)); + } + '\\' => { + let class = try!(self.parse_class_escape()); + bracket_stack.push(Bracket::Set(class)); + } + '&' if self.peek_is("&&") => { + self.bump(); + self.bump(); + bracket_stack.push(Bracket::Intersection); + } start => { if !self.flags.unicode { let _ = try!(self.codepoint_to_one_byte(start)); } self.bump(); match start { - '&'|'~'|'-' => { - // Only report an error if we see && or ~~ or --. + '~'|'-' => { + // Only report an error if we see ~~ or --. if self.peek_is(start) { return Err(self.err( ErrorKind::UnsupportedClassChar(start))); @@ -608,50 +635,97 @@ impl Parser { } _ => {} } - try!(self.parse_class_range(&mut class, start)); + let class = try!(self.parse_class_range(start)); + bracket_stack.push(Bracket::Set(class)); } } } - class = self.class_transform(negated, class).canonicalize(); + } + + // Parses the start of a character class or a nested character class. + // That includes negation using `^` and unescaped `-` and `]` allowed at + // the start of the class. + // + // e.g., `[^a]` or `[-a]` or `[]a]` + // + // Start: `[` + // End: `a` + fn parse_open_bracket(&mut self) -> Vec { + self.bump(); + self.ignore_space(); + let negated = self.bump_if('^'); + self.ignore_space(); + + let mut class = CharClass::empty(); + while self.bump_if('-') { + class.ranges.push(ClassRange::one('-')); + self.ignore_space(); + } if class.is_empty() { - // e.g., [^\d\D] - return Err(self.err(ErrorKind::EmptyClass)); + if self.bump_if(']') { + class.ranges.push(ClassRange::one(']')); + self.ignore_space(); + } } - Ok(Build::Expr(if self.flags.unicode { - Expr::Class(class) + + let bracket = Bracket::LeftBracket { negated: negated }; + if class.is_empty() { + vec![bracket] } else { - let byte_class = class.to_byte_class(); + vec![bracket, Bracket::Set(class)] + } + } - // If `class` was only non-empty due to multibyte characters, the - // corresponding byte class will now be empty. 
- // - // See https://github.com/rust-lang/regex/issues/303 - if byte_class.is_empty() { - // e.g., (?-u)[^\x00-\xFF] - return Err(self.err(ErrorKind::EmptyClass)); + // Parses an escape in a character class. + // + // This is a helper for `parse_class_as_chars`. It returns the parsed + // escape (or the range it starts) as a `CharClass`, or an error. + // + // e.g., `\wx` + // + // Start: `\` + // End: `x` + fn parse_class_escape(&mut self) -> Result { + match try!(self.parse_escape()) { + Build::Expr(Expr::Class(class)) => { + Ok(class) } - - Expr::ClassBytes(byte_class) - })) + Build::Expr(Expr::ClassBytes(class2)) => { + let mut class = CharClass::empty(); + for byte_range in class2 { + let s = byte_range.start as char; + let e = byte_range.end as char; + class.ranges.push(ClassRange::new(s, e)); + } + Ok(class) + } + Build::Expr(Expr::Literal { chars, .. }) => { + self.parse_class_range(chars[0]) + } + Build::Expr(Expr::LiteralBytes { bytes, .. }) => { + let start = bytes[0] as char; + self.parse_class_range(start) + } + Build::Expr(e) => { + let err = ErrorKind::InvalidClassEscape(e); + Err(self.err(err)) + } + // Because `parse_escape` can never return `LeftParen`. + _ => unreachable!(), + } } // Parses a single range in a character class. // - // Since this is a helper for `parse_class`, its signature sticks out. - // Namely, it requires the start character of the range and the char - // class to mutate. - // // e.g., `[a-z]` // // Start: `-` (with start == `a`) // End: `]` - fn parse_class_range(&mut self, class: &mut CharClass, start: char) - -> Result<()> { + fn parse_class_range(&mut self, start: char) -> Result { self.ignore_space(); if !self.bump_if('-') { - // Not a range, so just push a singleton range. - class.ranges.push(ClassRange::one(start)); - return Ok(()); + // Not a range, so just return a singleton range. 
+ return Ok(CharClass::new(vec![ClassRange::one(start)])); } self.ignore_space(); if self.eof() { @@ -661,9 +735,7 @@ impl Parser { if self.peek_is(']') { // This is the end of the class, so we permit use of `-` as a // regular char (just like we do in the beginning). - class.ranges.push(ClassRange::one(start)); - class.ranges.push(ClassRange::one('-')); - return Ok(()); + return Ok(CharClass::new(vec![ClassRange::one(start), ClassRange::one('-')])); } // We have a real range. Just need to check to parse literal and @@ -700,8 +772,7 @@ impl Parser { end: end, })); } - class.ranges.push(ClassRange::new(start, end)); - Ok(()) + Ok(CharClass::new(vec![ClassRange::new(start, end)])) } // Parses an ASCII class, e.g., `[:alnum:]+`. @@ -1165,6 +1236,62 @@ impl Parser { } } +// Methods for working with the bracket stack used for character class parsing. +impl Parser { + + // After parsing a closing bracket `]`, process elements of the bracket + // stack until finding the corresponding opening bracket `[`, and return + // the combined character class. E.g. with `[^b-f&&ab-c]`: + // + // 1. Adjacent sets are merged into a single union: `ab-c` -> `a-c` + // 2. Unions separated by `&&` are intersected: `b-f` and `a-c` -> `b-c` + // 3. 
Negation is applied if necessary: `b-c` -> negation of `b-c` + fn close_bracket(&self, stack: &mut Vec) -> Result { + let mut union = CharClass::empty(); + let mut intersect = vec![]; + loop { + match stack.pop() { + Some(Bracket::Set(class)) => { + union.ranges.extend(class); + } + Some(Bracket::Intersection) => { + let class = self.class_union_transform(union); + intersect.push(class); + union = CharClass::empty(); + } + Some(Bracket::LeftBracket { negated }) => { + let mut class = self.class_union_transform(union); + for c in intersect { + class = class.intersection(&c); + } + // negate after combining all sets (`^` has lower precedence than `&&`) + if negated { + class = class.negate(); + } + if class.is_empty() { + // e.g., [^\d\D] + return Err(self.err(ErrorKind::EmptyClass)); + } + return Ok(class); + } + // The first element on the stack is a `LeftBracket` + None => unreachable!() + } + } + } + + // Apply case folding if requested on the union character class, and + // return a canonicalized class. 
+ fn class_union_transform(&self, class: CharClass) -> CharClass { + if self.flags.casei { + // Case folding canonicalizes too + class.case_fold() + } else { + class.canonicalize() + } + } +} + impl Build { fn is_empty(&self) -> bool { match *self { @@ -2247,6 +2374,262 @@ mod tests { ])); } + #[test] + fn class_nested_class_union() { + assert_eq!(p(r"[c[a-b]]"), Expr::Class(class(&[('a', 'c')]))); + assert_eq!(p(r"[[a-b]]"), Expr::Class(class(&[('a', 'b')]))); + assert_eq!(p(r"[[c][a-b]]"), Expr::Class(class(&[('a', 'c')]))); + + assert_eq!(pb(r"(?-u)[c[a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'c')]))); + assert_eq!(pb(r"(?-u)[[a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'b')]))); + assert_eq!(pb(r"(?-u)[[c][a-b]]"), + Expr::ClassBytes(bclass(&[(b'a', b'c')]))); + } + + #[test] + fn class_nested_class_union_casei() { + assert_eq!(p(r"(?i)[c[a-b]]"), + Expr::Class(class(&[('a', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[[a-b]]"), + Expr::Class(class(&[('a', 'b')]).case_fold())); + assert_eq!(p(r"(?i)[[c][a-b]]"), + Expr::Class(class(&[('a', 'c')]).case_fold())); + + assert_eq!(pb(r"(?i-u)[[\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + } + + #[test] + fn class_nested_class_negate() { + assert_eq!(p(r"[^[\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[[^\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[^\d]]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[^[\w]]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[[^\w]]"), Expr::Class(class(PERLW).negate())); + assert_eq!(p(r"[^[^\w]]"), Expr::Class(class(PERLW))); + assert_eq!(p(r"[a-b[^c]]"), + Expr::Class(class(&[('\u{0}', 'b'), ('d', '\u{10FFFF}')]))); + + assert_eq!(pb(r"(?-u)[^[\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[[^\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[^\d]]"), + Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[^[\w]]"), + 
Expr::ClassBytes(asciiw_bytes().negate())); + assert_eq!(pb(r"(?-u)[[^\w]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[^\w]]"), + Expr::ClassBytes(asciiw_bytes())); + assert_eq!(pb(r"(?-u)[a-b[^c]]"), + Expr::ClassBytes(bclass(&[(b'\x00', b'b'), (b'd', b'\xFF')]))) + } + + #[test] + fn class_nested_class_negate_casei() { + assert_eq!(p(r"(?i)[^[\d]]"), + Expr::Class(class(PERLD).case_fold().negate())); + assert_eq!(p(r"(?i)[[^\d]]"), + Expr::Class(class(PERLD).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\d]]"), + Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[^[\w]]"), + Expr::Class(class(PERLW).case_fold().negate())); + assert_eq!(p(r"(?i)[[^\w]]"), + Expr::Class(class(PERLW).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\w]]"), + Expr::Class(class(PERLW).case_fold())); + let mut cls = CharClass::empty().negate(); + cls.remove('c'); + cls.remove('C'); + assert_eq!(p(r"(?i)[a-b[^c]]"), Expr::Class(cls)); + + assert_eq!(pb(r"(?i-u)[^[\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[[^\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\d]]"), + Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[^[\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[[^\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\w]]"), + Expr::ClassBytes(asciiw_bytes().case_fold())); + let mut bytes = ByteClass::new(vec![]).negate(); + bytes.remove(b'c'); + bytes.remove(b'C'); + assert_eq!(pb(r"(?i-u)[a-b[^c]]"), Expr::ClassBytes(bytes)); + } + + #[test] + fn class_nested_class_brackets_hyphen() { + // This is confusing, but `]` is allowed if first character within a class + // It parses as a nested class with the `]` and `-` characters + assert_eq!(p(r"[[]-]]"), Expr::Class(class(&[('-', '-'), (']', ']')]))); + assert_eq!(p(r"[[\[]]"), 
Expr::Class(class(&[('[', '[')]))); + assert_eq!(p(r"[[\]]]"), Expr::Class(class(&[(']', ']')]))); + } + + #[test] + fn class_nested_class_deep_nesting() { + // Makes sure that implementation can handle deep nesting. + // With recursive parsing, this regex would blow the stack size. + use std::iter::repeat; + let nesting = 10_000; + let open: String = repeat("[").take(nesting).collect(); + let close: String = repeat("]").take(nesting).collect(); + let s = format!("{}a{}", open, close); + assert_eq!(p(&s), Expr::Class(class(&[('a', 'a')]))); + } + + #[test] + fn class_intersection_ranges() { + assert_eq!(p(r"[abc&&b-c]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[abc&&[b-c]]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[[abc]&&[b-c]]"), Expr::Class(class(&[('b', 'c')]))); + assert_eq!(p(r"[a-z&&b-y&&c-x]"), Expr::Class(class(&[('c', 'x')]))); + assert_eq!(p(r"[c-da-b&&a-d]"), Expr::Class(class(&[('a', 'd')]))); + assert_eq!(p(r"[a-d&&c-da-b]"), Expr::Class(class(&[('a', 'd')]))); + + assert_eq!(pb(r"(?-u)[abc&&b-c]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[abc&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[[abc]&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]))); + assert_eq!(pb(r"(?-u)[a-z&&b-y&&c-x]"), + Expr::ClassBytes(bclass(&[(b'c', b'x')]))); + assert_eq!(pb(r"(?-u)[c-da-b&&a-d]"), + Expr::ClassBytes(bclass(&[(b'a', b'd')]))); + } + + #[test] + fn class_intersection_ranges_casei() { + assert_eq!(p(r"(?i)[abc&&b-c]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[abc&&[b-c]]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[[abc]&&[b-c]]"), + Expr::Class(class(&[('b', 'c')]).case_fold())); + assert_eq!(p(r"(?i)[a-z&&b-y&&c-x]"), + Expr::Class(class(&[('c', 'x')]).case_fold())); + assert_eq!(p(r"(?i)[c-da-b&&a-d]"), + Expr::Class(class(&[('a', 'd')]).case_fold())); + + assert_eq!(pb(r"(?i-u)[abc&&b-c]"), + 
Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[abc&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[[abc]&&[b-c]]"), + Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold())); + assert_eq!(pb(r"(?i-u)[a-z&&b-y&&c-x]"), + Expr::ClassBytes(bclass(&[(b'c', b'x')]).case_fold())); + assert_eq!(pb(r"(?i-u)[c-da-b&&a-d]"), + Expr::ClassBytes(bclass(&[(b'a', b'd')]).case_fold())); + } + + #[test] + fn class_intersection_classes() { + assert_eq!(p(r"[\w&&\d]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\w&&[[:ascii:]]]"), Expr::Class(asciiw())); + assert_eq!(p(r"[\x00-\xFF&&\pZ]"), + Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')]))); + + assert_eq!(pb(r"(?-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes())); + } + + #[test] + fn class_intersection_classes_casei() { + assert_eq!(p(r"(?i)[\w&&\d]"), Expr::Class(class(PERLD).case_fold())); + assert_eq!(p(r"(?i)[\w&&[[:ascii:]]]"), Expr::Class(asciiw().case_fold())); + assert_eq!(p(r"(?i)[\x00-\xFF&&\pZ]"), + Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')]))); + + assert_eq!(pb(r"(?i-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes().case_fold())); + assert_eq!(pb(r"(?i-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes().case_fold())); + } + + #[test] + fn class_intersection_negate() { + assert_eq!(p(r"[^\w&&\d]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[\w&&\d]]"), Expr::Class(class(PERLD).negate())); + assert_eq!(p(r"[^[^\w&&\d]]"), Expr::Class(class(PERLD))); + assert_eq!(p(r"[\w&&[^\d]]"), + Expr::Class(class(PERLW).intersection(&class(PERLD).negate()))); + assert_eq!(p(r"[[^\w]&&[^\d]]"), + Expr::Class(class(PERLW).negate())); + + assert_eq!(pb(r"(?-u)[^\w&&\d]"), + Expr::ClassBytes(asciid_bytes().negate())); + assert_eq!(pb(r"(?-u)[^[\w&&\d]]"), + Expr::ClassBytes(asciid_bytes().negate())); + 
assert_eq!(pb(r"(?-u)[^[^\w&&\d]]"), + Expr::ClassBytes(asciid_bytes())); + assert_eq!(pb(r"(?-u)[\w&&[^\d]]"), + Expr::ClassBytes(asciiw().intersection(&asciid().negate()).to_byte_class())); + assert_eq!(pb(r"(?-u)[[^\w]&&[^\d]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + } + + #[test] + fn class_intersection_negate_casei() { + assert_eq!(p(r"(?i)[^\w&&a-z]"), + Expr::Class(class(&[('a', 'z')]).case_fold().negate())); + assert_eq!(p(r"(?i)[^[\w&&a-z]]"), + Expr::Class(class(&[('a', 'z')]).case_fold().negate())); + assert_eq!(p(r"(?i)[^[^\w&&a-z]]"), + Expr::Class(class(&[('a', 'z')]).case_fold())); + assert_eq!(p(r"(?i)[\w&&[^a-z]]"), + Expr::Class( + class(PERLW).intersection(&class(&[('a', 'z')]) + .case_fold().negate()))); + assert_eq!(p(r"(?i)[[^\w]&&[^a-z]]"), + Expr::Class(class(PERLW).negate())); + + assert_eq!(pb(r"(?i-u)[^\w&&a-z]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[\w&&a-z]]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate())); + assert_eq!(pb(r"(?i-u)[^[^\w&&a-z]]"), + Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold())); + assert_eq!(pb(r"(?i-u)[\w&&[^a-z]]"), + Expr::ClassBytes(bclass(&[(b'0', b'9'), (b'_', b'_')]))); + assert_eq!(pb(r"(?i-u)[[^\w]&&[^a-z]]"), + Expr::ClassBytes(asciiw_bytes().negate())); + } + + #[test] + fn class_intersection_caret() { + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(p(r"[\^&&^]"), Expr::Class(class(&[('^', '^')]))); + } + + #[test] + fn class_intersection_brackets_hyphen() { + // `]` needs to be escaped after `&&` because it is not at the start of the class. 
+ assert_eq!(p(r"[]&&\]]"), Expr::Class(class(&[(']', ']')]))); + + assert_eq!(p(r"[-&&-]"), Expr::Class(class(&[('-', '-')]))); + } + + #[test] + fn class_intersection_ampersand() { + // Unescaped `&` after `&&` + assert_eq!(p(r"[\&&&&]"), Expr::Class(class(&[('&', '&')]))); + assert_eq!(p(r"[\&&&\&]"), Expr::Class(class(&[('&', '&')]))); + } + + #[test] + fn class_intersection_precedence() { + assert_eq!(p(r"[a-w&&[^c-g]z]"), Expr::Class(class(&[('a', 'b'), ('h', 'w')]))); + } + #[test] fn class_special_escaped_set_chars() { // These tests ensure that some special characters require escaping @@ -2876,11 +3259,33 @@ mod tests { // rejected in character classes. The intention is to use these // characters to implement sets as described in UTS#18 RL1.3. Once // that's done, these tests should be removed and replaced with others. - test_err!("[[]", 1, ErrorKind::UnsupportedClassChar('[')); - test_err!("[&&]", 2, ErrorKind::UnsupportedClassChar('&')); test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~')); test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-')); test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-')); + test_err!(r"[a&&~~]", 5, ErrorKind::UnsupportedClassChar('~')); + test_err!(r"[a&&--]", 5, ErrorKind::UnsupportedClassChar('-')); + } + + #[test] + fn error_class_nested_class() { + test_err!(r"[[]]", 4, ErrorKind::UnexpectedClassEof); + test_err!(r"[[][]]", 6, ErrorKind::UnexpectedClassEof); + test_err!(r"[[^\d\D]]", 8, ErrorKind::EmptyClass); + test_err!(r"[[]", 3, ErrorKind::UnexpectedClassEof); + test_err!(r"[[^]", 4, ErrorKind::UnexpectedClassEof); + } + + #[test] + fn error_class_intersection() { + test_err!(r"[&&]", 4, ErrorKind::EmptyClass); + test_err!(r"[a&&]", 5, ErrorKind::EmptyClass); + test_err!(r"[&&&&]", 6, ErrorKind::EmptyClass); + // `]` after `&&` is not the same as in (`[]]`), because it's also not + // allowed unescaped in `[a]]`. 
+ test_err!(r"[]&&]]", 5, ErrorKind::EmptyClass); + + let flags = Flags { allow_bytes: true, .. Flags::default() }; + test_err!(r"(?-u)[a&&\pZ]", 12, ErrorKind::UnicodeNotAllowed, flags); } #[test] diff --git a/regex-syntax/src/properties.rs b/regex-syntax/src/properties.rs index 804a90ba12..9e75f99a4e 100644 --- a/regex-syntax/src/properties.rs +++ b/regex-syntax/src/properties.rs @@ -59,6 +59,29 @@ fn valid_class_ranges() { qc(prop as fn(Vec<(char, char)>) -> bool); } +#[test] +fn intersection() { + fn prop(ranges1: Vec<(char, char)>, ranges2: Vec<(char, char)>) -> bool { + let class1 = class(&ranges1).canonicalize(); + let class2 = class(&ranges2).canonicalize(); + + let mut expected = CharClass::empty(); + // This is inefficient but correct. + for range1 in &class1 { + for range2 in &class2 { + if let Some(intersection) = range1.intersection(range2) { + expected.ranges.push(intersection); + } + } + } + expected = expected.canonicalize(); + + let got = class1.intersection(&class2); + expected == got + } + qc(prop as fn(Vec<(char, char)>, Vec<(char, char)>) -> bool); +} + /// A wrapper type for generating "regex-like" Unicode strings. /// /// In particular, this type's `Arbitrary` impl specifically biases toward diff --git a/src/lib.rs b/src/lib.rs index 96cdb45c86..128932e620 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,22 +288,38 @@ a separate crate, [`regex-syntax`](../regex_syntax/index.html).
 .             any character except new line (includes new line with s flag)
-[xyz]         A character class matching either x, y or z.
-[^xyz]        A character class matching any character except x, y and z.
-[a-z]         A character class matching any character in range a-z.
 \d            digit (\p{Nd})
 \D            not digit
-[[:alpha:]]   ASCII character class ([A-Za-z])
-[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
 \pN           One-letter name Unicode character class
 \p{Greek}     Unicode character class (general category or script)
 \PN           Negated one-letter name Unicode character class
 \P{Greek}     negated Unicode character class (general category or script)
 
+### Character classes + +
+[xyz]         A character class matching either x, y or z (union).
+[^xyz]        A character class matching any character except x, y and z.
+[a-z]         A character class matching any character in range a-z.
+[[:alpha:]]   ASCII character class ([A-Za-z])
+[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
+[x[^xyz]]     Nested/grouping character class (matching any character except y and z)
+[a-y&&xyz]    Intersection (matching x or y)
+[0-9&&[^4]]   Subtraction using intersection and negation (matching 0-9 except 4)
+[\[\]]        Escaping in character classes (matching [ or ])
+
+ Any named character class may appear inside a bracketed `[...]` character class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII -digit. +digit. `[\p{Greek}&&\pL]` matches Greek letters. + +Precedence in character classes, from most binding to least: + +1. Ranges: `a-cd` == `[a-c]d` +2. Union: `ab&&bc` == `[ab]&&[bc]` +3. Intersection: `^a-z&&b` == `^[a-z&&b]` +4. Negation ## Composites