diff --git a/Cargo.lock b/Cargo.lock index 4299b87ea2ba45004e6a37e27e688d2e52e6e8b8..6b2bc4e4f15998334b0a32ce18b34ce8ea0a566c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,9 +26,9 @@ checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" [[package]] name = "aho-corasick" -version = "0.7.6" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] diff --git a/third_party/rust/aho-corasick/.cargo-checksum.json b/third_party/rust/aho-corasick/.cargo-checksum.json index 499cc0d71e71bed3dd4a0f672fb3c0d56ffae77d..ec143d7f940ea819528c8d4673a1ae9f0caa6db8 100644 --- a/third_party/rust/aho-corasick/.cargo-checksum.json +++ b/third_party/rust/aho-corasick/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"a2f9a1263aa35a92af4ffc1935b264f062738bc25761aa62b3d582031d6bf5f0","DESIGN.md":"44d4516ef38d60e9638f756baf40bcd9eff1b8e8ce7538a1d8549e02d6605d48","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"626d74e4bdac78d2446c75c722a7e46d0eaa4e506a1068ff693b5abc338a384f","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"46c57a83a75a8f25fdf19a15deae10748d12b8af9445ae74700a546a92024608","src/automaton.rs":"85e79ceb964f824fcceca026abd255980840116704834d70a1b9c44833df299f","src/buffer.rs":"c40992e7d1ba0bac6d1c268d41069aad81f2226686c64192ed888a60f66db8cd","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"590f2e257bf7c630bea3a28d4a1f75c78db7a0802f5921aced017a056146b4e6","src/dfa.rs":"2fb1077edfefd2b7f7e9c0d9df55df1441d4571500a2c45aa5b41960a36441e4","src/error.rs":"36dbf2cefbfaa8a69186551320dbff023d3e82780a6c925e87c3e3997b967e66","src/lib.rs":"028ab998e8f0d1a98650b139bcca83681cbb52545060b9253b76d7e19117b53d","src/nfa.rs":"6bc3479ad37c576bba4bbdc9e3d0c6e69a4b7f0d9a88fcbbf727bf4a9b288494","src/packed/api.rs":"aa89627c7114c057c98ad1c7ab9ce18c6ed55267a6bcf7bc8efb917b6cfe5532","src/packed/mod.rs":"29c76ad3cbb1f831140cefac7a27fb504ac4af4f454975a571965b48aad417eb","src/packed/pattern.rs":"b88c57af057997da0a5a06f4c5604a7e598c20acfc11c15cd8977727f6e1cf9c","src/packed/rabinkarp.rs":"b3242a8631ea5607163dcbb641e4ac9c6da26774378da1e51651b0ab5656b390","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"21b18cbee9bc33918b85b1dc51a0faed57acb426f61e6b72aeaf69faa7595701","src/packed/teddy/mod.rs":"f63db3419b1d378929bf0bc1f0e3b909ff3c38b9f2b6e86ba4546b8f39907cd3","src/packed/teddy/runtime.rs":"0a1250ea73159b3be6e0fa9a3f55ecedbb2cb90cb798d1709e9f5ee48f8855d5","src/packed/tests.rs":"0b52ab9eef73a1a4f141f475a9fa98e54d447104aa69acba3a7f8248ce7164b2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"3dbe93d85c6fb985a9aea0b5eab003fe81a228e02adba00c8f63a35c3fd246b8","src/state_id.rs":"ebecd7046760e6bd72303f288be93342b446e7fe95f20b5ce23653d802c48b09","src/tests.rs":"9201cc0662bc9a1e8fa15c59e33a18a55ec6b3bd6bbea294d9cace0053bb8d24"},"package":"58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"} \ No newline at end of file 
+{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"f61283fd900435313b9ba8c1b87a4b5b31d442f9b554222136ec8d1d3d1e39d8","DESIGN.md":"9065f33d818d1562244d36dc4781e2a351108030cee17f11c2ba512ca7b4c27e","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"741e7249c8d1d6a7ba9341d68253dbf4952477c5620ff37c5325f2e894b148b6","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"6fcbe812eec7af44b104c6b8a27b0a2ea8d67c3d9aec73cb69d802b30be5f005","src/automaton.rs":"610b3e2c104c51bf4f51a6d07626c3972e9d1274ca276e987385a231b284cc8b","src/buffer.rs":"dae7ee7c1f846ca9cf115ba4949484000e1837b4fb7311f8d8c9a35011c9c26f","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"99a53a2ed8eea8c13699def90e31dfdff9d0b90572b1db3cb534e3396e7a0ed0","src/dfa.rs":"25e4455b3e179a7e192108d05f3683993456b36e3ebed99f827558c52525b7e6","src/error.rs":"d34c2c9c815df5d9dedc46b4b3ce109cd2cee07825de643f0c574ec960367beb","src/lib.rs":"f0c48b0ee093dd8b3034d025d052c3667860c5d4a196cb178588012b719acea4","src/nfa.rs":"2f443951c78196126bfd237ed5770a69077e6190daeecd47131339c25e51a3d0","src/packed/api.rs":"ec58ff1b4375dd4ff88fb5859c7ede994fe08d31b7d3677720a086592aa0fe53","src/packed/mod.rs":"d7ee11d487a7f129f16dc8f1473442a7127905933f378504bae83df0f23c5e2a","src/packed/pattern.rs":"3abf3835d4c4f8a43753c52936a894d819f713f233fc046e19de5ef95200dcce","src/packed/rabinkarp.rs":"caf9563b7442c9b75c9cb520fa236c7a6da8173705889b8d79b69ede14a20767","src/packed/teddy/README.md":"5819f40d221af93288e705eadef5393a41d7a0900881b4d676e01fd65d5adf15","src/packed/teddy/compile.rs":"aad40b3f93d2c388b409b31fb2795d414a365237789d5b1a7510d97ceb8ce260","src/packed/teddy/mod.rs":"83b52bd80272970ad17234d0db293d17c1710ec582302bf516b203c8edec037e","src/packed/teddy/runtime.rs":"836146e90b320b14fa2c65fe4af7915a41f6fb04408aac5fac731c22ff46adae","src/packed/tests.rs":"b8dc4d3281ecd6d0fa2bf7ef16cf292a467dfdce64e470c7921e983bfa60fee2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"82a3eb6d5c0c3f10bc8d5f57d55d6d14cf4cf21c475bb5253e1921084063b8d7","src/state_id.rs":"519ec8c7bf3fa72103d4c561c193759759f535dca924c9853efe630f406d2029","src/tests.rs":"6522ed1b244513c01de5bbcf0fe35571454fdea2c2a9d8dfe13a04bf57b70eca"},"package":"1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"} \ No newline at end of file diff --git a/third_party/rust/aho-corasick/Cargo.toml b/third_party/rust/aho-corasick/Cargo.toml index a4542e1c744c6d7c6d8d2458c47d1e338d148998..62b5f7e12909af6c9b502cae7f0aaf683bcb7a16 100644 --- a/third_party/rust/aho-corasick/Cargo.toml +++ b/third_party/rust/aho-corasick/Cargo.toml @@ -11,8 +11,9 @@ # will likely look very different (and much more reasonable) [package] +edition = "2018" name = "aho-corasick" -version = "0.7.6" +version = "0.7.18" authors = ["Andrew Gallant <jamslam@gmail.com>"] exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"] autotests = false @@ -32,16 +33,11 @@ debug = true [lib] name = "aho_corasick" [dependencies.memchr] -version = "2.2.0" +version = "2.4.0" default-features = false -[dev-dependencies.doc-comment] -version = "0.3.1" + +[dev-dependencies] [features] default = ["std"] -std = ["memchr/use_std"] -[badges.appveyor] -repository = "BurntSushi/aho-corasick" - 
-[badges.travis-ci] -repository = "BurntSushi/aho-corasick" +std = ["memchr/std"] diff --git a/third_party/rust/aho-corasick/DESIGN.md b/third_party/rust/aho-corasick/DESIGN.md index 8d63a0109c997d7feb06f695c68b6e80e3d0c9f2..367e203df820161d22d378e33ce34be84e7d4aa6 100644 --- a/third_party/rust/aho-corasick/DESIGN.md +++ b/third_party/rust/aho-corasick/DESIGN.md @@ -2,7 +2,7 @@ This document describes the internal design of this crate, which is an object lesson in what happens when you take a fairly simple old algorithm like Aho-Corasick and make it fast and production ready. -The target audience of this crate is Rust programmers that have some +The target audience of this document is Rust programmers that have some familiarity with string searching, however, one does not need to know the Aho-Corasick algorithm in order to read this (it is explained below). One should, however, know what a trie is. (If you don't, go read its Wikipedia @@ -13,7 +13,7 @@ own, Aho-Corasick isn't that complicated. The complex pieces come from the different variants of Aho-Corasick implemented in this crate. Specifically, they are: -* Aho-Corasick as an NFA, using dense transitions near root with sparse +* Aho-Corasick as an NFA, using dense transitions near the root with sparse transitions elsewhere. * Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct and uses less memory.) @@ -74,7 +74,7 @@ one is Aho-Corasick. It's a common solution because it's not too hard to implement, scales quite well even when searching for thousands of patterns and is generally pretty fast. Aho-Corasick does well here because, regardless of the number of patterns you're searching for, it always visits each byte in the -haystack exactly ocne. This means, generally speaking, adding more patterns to +haystack exactly once. This means, generally speaking, adding more patterns to an Aho-Corasick automaton does not make it slower. (Strictly speaking, however, this is not true, since a larger automaton will make less effective use of the CPU's cache.) @@ -277,12 +277,12 @@ there are a small number of patterns. # More DFA tricks -As described in the previous section, one of the downsides of using a DFA is -that is uses more memory and can take longer to builder. One small way of -mitigating these concerns is to map the alphabet used by the automaton into a -smaller space. Typically, the alphabet of a DFA has 256 elements in it: one -element for each possible value that fits into a byte. However, in many cases, -one does not need the full alphabet. For example, if all patterns in an +As described in the previous section, one of the downsides of using a DFA +is that is uses more memory and can take longer to build. One small way of +mitigating these concerns is to map the alphabet used by the automaton into +a smaller space. Typically, the alphabet of a DFA has 256 elements in it: +one element for each possible value that fits into a byte. However, in many +cases, one does not need the full alphabet. For example, if all patterns in an Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct bytes. 
As far as the automaton is concerned, the rest of the 204 bytes are indistinguishable from one another: they will never disrciminate between a diff --git a/third_party/rust/aho-corasick/README.md b/third_party/rust/aho-corasick/README.md index b4acbee8e3eb3d44869287ed21bbe0010293713b..cd430518e281d8cb6f4490891879ab5a019b0f62 100644 --- a/third_party/rust/aho-corasick/README.md +++ b/third_party/rust/aho-corasick/README.md @@ -5,11 +5,10 @@ acceleration in some cases. This library provides multiple pattern search principally through an implementation of the [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which builds a finite state machine for executing searches in linear time. -Features include case insensitive matching, overlapping matches and search & -replace in streams. +Features include case insensitive matching, overlapping matches, fast searching +via SIMD and optional full DFA construction and search & replace in streams. -[](https://travis-ci.org/BurntSushi/aho-corasick) -[](https://ci.appveyor.com/project/BurntSushi/aho-corasick) +[](https://github.com/BurntSushi/aho-corasick/actions) [](https://crates.io/crates/aho-corasick) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). @@ -29,12 +28,6 @@ Add this to your `Cargo.toml`: aho-corasick = "0.7" ``` -and this to your crate root (if you're using Rust 2015): - -```rust -extern crate aho_corasick; -``` - ### Example: basic searching @@ -95,7 +88,6 @@ loading the entire stream into memory first. ```rust use aho_corasick::AhoCorasick; -# fn example() -> Result<(), ::std::io::Error> { let patterns = &["fox", "brown", "quick"]; let replace_with = &["sloth", "grey", "slow"]; @@ -105,9 +97,9 @@ let rdr = "The quick brown fox."; let mut wtr = vec![]; let ac = AhoCorasick::new(patterns); -ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) + .expect("stream_replace_all failed"); assert_eq!(b"The slow grey sloth.".to_vec(), wtr); -# Ok(()) }; example().unwrap() ``` @@ -164,11 +156,16 @@ expression alternation. See `MatchKind` in the docs for more details. ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.28.0`. +This crate's minimum supported `rustc` version is `1.41.1`. + +The current policy is that the minimum Rust version required to use this crate +can be increased in minor version updates. For example, if `crate 1.0` requires +Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust +1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum +version of Rust. In general, this crate will be conservative with respect to the minimum -supported version of Rust. In general, it will follow the `regex` crate's -policy, since `regex` is an important dependent. +supported version of Rust. 
### Future work diff --git a/third_party/rust/aho-corasick/src/ahocorasick.rs b/third_party/rust/aho-corasick/src/ahocorasick.rs index 9b7d9e7c33d47ff2bee9fe4515a5f5f431838594..2b1aa5c4c408f4342bf0d51d8240ac1958663584 100644 --- a/third_party/rust/aho-corasick/src/ahocorasick.rs +++ b/third_party/rust/aho-corasick/src/ahocorasick.rs @@ -1,14 +1,14 @@ use std::io; -use automaton::Automaton; -use buffer::Buffer; -use dfa::{self, DFA}; -use error::Result; -use nfa::{self, NFA}; -use packed; -use prefilter::PrefilterState; -use state_id::StateID; -use Match; +use crate::automaton::Automaton; +use crate::buffer::Buffer; +use crate::dfa::{self, DFA}; +use crate::error::Result; +use crate::nfa::{self, NFA}; +use crate::packed; +use crate::prefilter::{Prefilter, PrefilterState}; +use crate::state_id::StateID; +use crate::Match; /// An automaton for searching multiple strings in linear time. /// @@ -502,7 +502,7 @@ impl<S: StateID> AhoCorasick<S> { /// The closure accepts three parameters: the match found, the text of /// the match and a string buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next - /// match. If the closure returns false, then searching is stopped. + /// match. If the closure returns `false`, then searching is stopped. /// /// # Examples /// @@ -524,6 +524,24 @@ impl<S: StateID> AhoCorasick<S> { /// }); /// assert_eq!("0 the 2 to the 0age", result); /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = "append the app to the appendage"; + /// # let ac = AhoCorasickBuilder::new() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns); + /// let mut result = String::new(); + /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().to_string()); + /// mat.pattern() != 2 + /// }); + /// assert_eq!("0 the 2 to the appendage", result); + /// ``` pub fn replace_all_with<F>( &self, haystack: &str, @@ -536,7 +554,9 @@ impl<S: StateID> AhoCorasick<S> { for mat in self.find_iter(haystack) { dst.push_str(&haystack[last_match..mat.start()]); last_match = mat.end(); - replace_with(&mat, &haystack[mat.start()..mat.end()], dst); + if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { + break; + }; } dst.push_str(&haystack[last_match..]); } @@ -548,7 +568,7 @@ impl<S: StateID> AhoCorasick<S> { /// The closure accepts three parameters: the match found, the text of /// the match and a byte buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next - /// match. If the closure returns false, then searching is stopped. + /// match. If the closure returns `false`, then searching is stopped. 
/// /// # Examples /// @@ -570,6 +590,24 @@ impl<S: StateID> AhoCorasick<S> { /// }); /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = b"append the app to the appendage"; + /// # let ac = AhoCorasickBuilder::new() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns); + /// let mut result = vec![]; + /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().to_string().bytes()); + /// mat.pattern() != 2 + /// }); + /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); + /// ``` pub fn replace_all_with_bytes<F>( &self, haystack: &[u8], @@ -582,7 +620,9 @@ impl<S: StateID> AhoCorasick<S> { for mat in self.find_iter(haystack) { dst.extend(&haystack[last_match..mat.start()]); last_match = mat.end(); - replace_with(&mat, &haystack[mat.start()..mat.end()], dst); + if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { + break; + }; } dst.extend(&haystack[last_match..]); } @@ -735,9 +775,7 @@ impl<S: StateID> AhoCorasick<S> { /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// The closure accepts three parameters: the match found, the text of - /// the match and the writer with which to write the replaced text - /// (if any). If the closure returns `true`, then it continues to the next - /// match. If the closure returns false, then searching is stopped. + /// the match and the writer with which to write the replaced text (if any). /// /// After all matches are replaced, the writer is _not_ flushed. /// @@ -967,18 +1005,6 @@ impl<S: StateID> AhoCorasick<S> { /// /// let ac = AhoCorasickBuilder::new() /// .dfa(true) - /// .byte_classes(false) - /// .build(&["foo", "bar", "baz"]); - /// assert_eq!(20_768, ac.heap_bytes()); - /// - /// let ac = AhoCorasickBuilder::new() - /// .dfa(true) - /// .byte_classes(true) // default - /// .build(&["foo", "bar", "baz"]); - /// assert_eq!(1_248, ac.heap_bytes()); - /// - /// let ac = AhoCorasickBuilder::new() - /// .dfa(true) /// .ascii_case_insensitive(true) /// .build(&["foo", "bar", "baz"]); /// assert_eq!(1_248, ac.heap_bytes()); @@ -1037,6 +1063,24 @@ impl<S: StateID> Imp<S> { } } + /// Returns the prefilter object, if one exists, for the underlying + /// automaton. + fn prefilter(&self) -> Option<&dyn Prefilter> { + match *self { + Imp::NFA(ref nfa) => nfa.prefilter(), + Imp::DFA(ref dfa) => dfa.prefilter(), + } + } + + /// Returns true if and only if we should attempt to use a prefilter. + fn use_prefilter(&self) -> bool { + let p = match self.prefilter() { + None => return false, + Some(p) => p, + }; + !p.looks_for_non_start_of_match() + } + #[inline(always)] fn overlapping_find_at( &self, @@ -1113,7 +1157,7 @@ impl<S: StateID> Imp<S> { /// /// The lifetime `'b` refers to the lifetime of the haystack being searched. #[derive(Debug)] -pub struct FindIter<'a, 'b, S: 'a + StateID> { +pub struct FindIter<'a, 'b, S: StateID> { fsm: &'a Imp<S>, prestate: PrefilterState, haystack: &'b [u8], @@ -1170,7 +1214,7 @@ impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { /// /// The lifetime `'b` refers to the lifetime of the haystack being searched. 
#[derive(Debug)] -pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> { +pub struct FindOverlappingIter<'a, 'b, S: StateID> { fsm: &'a Imp<S>, prestate: PrefilterState, haystack: &'b [u8], @@ -1241,7 +1285,7 @@ impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. #[derive(Debug)] -pub struct StreamFindIter<'a, R, S: 'a + StateID> { +pub struct StreamFindIter<'a, R, S: StateID> { it: StreamChunkIter<'a, R, S>, } @@ -1276,7 +1320,7 @@ impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { /// N.B. This does not actually implement Iterator because we need to borrow /// from the underlying reader. But conceptually, it's still an iterator. #[derive(Debug)] -struct StreamChunkIter<'a, R, S: 'a + StateID> { +struct StreamChunkIter<'a, R, S: StateID> { /// The AC automaton. fsm: &'a Imp<S>, /// State associated with this automaton's prefilter. It is a heuristic @@ -1325,7 +1369,11 @@ impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { "stream searching is only supported for Standard match semantics" ); - let prestate = PrefilterState::new(ac.max_pattern_len()); + let prestate = if ac.imp.use_prefilter() { + PrefilterState::new(ac.max_pattern_len()) + } else { + PrefilterState::disabled() + }; let buf = Buffer::new(ac.imp.max_pattern_len()); let state_id = ac.imp.start_state(); StreamChunkIter { @@ -1621,7 +1669,7 @@ impl AhoCorasickBuilder { // N.B. Using byte classes can actually be faster by improving // locality, but this only really applies for multi-megabyte // automata (i.e., automata that don't fit in your CPU's cache). - self.dfa(true).byte_classes(false); + self.dfa(true); } else if patterns.len() <= 5000 { self.dfa(true); } @@ -1809,7 +1857,7 @@ impl AhoCorasickBuilder { /// finite automaton (NFA) is used instead. /// /// The main benefit to a DFA is that it can execute searches more quickly - /// than a DFA (perhaps 2-4 times as fast). The main drawback is that the + /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the /// DFA uses more space and can take much longer to build. /// /// Enabling this option does not change the time complexity for @@ -1868,6 +1916,10 @@ impl AhoCorasickBuilder { /// overall performance. /// /// This option is enabled by default. + #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { self.dfa_builder.byte_classes(yes); self @@ -1896,6 +1948,10 @@ impl AhoCorasickBuilder { /// non-premultiplied form only requires 8 bits. /// /// This option is enabled by default. 
+ #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { self.dfa_builder.premultiply(yes); self diff --git a/third_party/rust/aho-corasick/src/automaton.rs b/third_party/rust/aho-corasick/src/automaton.rs index 2447639c241afcf7e998aecf26f30a3ca6a3332b..b971bf3418e5aed938394b16539ceecd64d97494 100644 --- a/third_party/rust/aho-corasick/src/automaton.rs +++ b/third_party/rust/aho-corasick/src/automaton.rs @@ -1,7 +1,7 @@ -use ahocorasick::MatchKind; -use prefilter::{self, Candidate, Prefilter, PrefilterState}; -use state_id::{dead_id, fail_id, StateID}; -use Match; +use crate::ahocorasick::MatchKind; +use crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; +use crate::state_id::{dead_id, fail_id, StateID}; +use crate::Match; // NOTE: This trait essentially started as a copy of the same trait from from // regex-automata, with some wording changed since we use this trait for @@ -28,6 +28,42 @@ use Match; // for tracking the state ID and one that doesn't. We should ideally do the // same for standard searching, but my sanity stopped me. +// SAFETY RATIONALE: Previously, the code below went to some length to remove +// all bounds checks. This generally produced tighter assembly and lead to +// 20-50% improvements in micro-benchmarks on corpora made up of random +// characters. This somewhat makes sense, since the branch predictor is going +// to be at its worse on random text. +// +// However, using the aho-corasick-debug tool and manually benchmarking +// different inputs, the code *with* bounds checks actually wound up being +// slightly faster: +// +// $ cat input +// Sherlock Holmes +// John Watson +// Professor Moriarty +// Irene Adler +// Mary Watson +// +// $ aho-corasick-debug-safe \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 32.824µs +// automaton build time: 444.687µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 1.809961702s +// +// $ aho-corasick-debug-master \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 31.425µs +// automaton build time: 317.434µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 2.059157705s +// +// I was able to reproduce this result on two different machines (an i5 and +// an i7). Therefore, we go the route of safe code for now. + /// A trait describing the interface of an Aho-Corasick finite state machine. /// /// Every automaton has exactly one fail state, one dead state and exactly one @@ -39,8 +75,8 @@ use Match; /// only when at least one match has been observed. /// /// Every automaton also has one or more match states, such that -/// `Automaton::is_match_state_unchecked(id)` returns `true` if and only if -/// `id` corresponds to a match state. +/// `Automaton::is_match_state(id)` returns `true` if and only if `id` +/// corresponds to a match state. pub trait Automaton { /// The representation used for state identifiers in this automaton. /// @@ -123,20 +159,12 @@ pub trait Automaton { /// must ensure that the given identifier corresponds to a valid automaton /// state. Implementors must, in turn, ensure that this routine is safe for /// all valid state identifiers and for all possible `u8` values. 
- unsafe fn next_state_unchecked( - &self, - current: Self::ID, - input: u8, - ) -> Self::ID; + fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; - /// Like next_state_unchecked, but debug_asserts that the underlying + /// Like next_state, but debug_asserts that the underlying /// implementation never returns a `fail_id()` for the next state. - unsafe fn next_state_unchecked_no_fail( - &self, - current: Self::ID, - input: u8, - ) -> Self::ID { - let next = self.next_state_unchecked(current, input); + fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { + let next = self.next_state(current, input); // We should never see a transition to the failure state. debug_assert!( next != fail_id(), @@ -174,7 +202,7 @@ pub trait Automaton { } } - // It's important for this to always be inlined. Namely, it's only caller + // It's important for this to always be inlined. Namely, its only caller // is standard_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. #[inline(always)] @@ -183,66 +211,49 @@ pub trait Automaton { prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], - at: usize, + mut at: usize, state_id: &mut Self::ID, ) -> Option<Match> { - // This is necessary for guaranteeing a safe API, since we use the - // state ID below in a function that exhibits UB if called with an - // invalid state ID. - assert!( - self.is_valid(*state_id), - "{} is not a valid state ID", - state_id.to_usize() - ); - unsafe { - let start = haystack.as_ptr(); - let end = haystack[haystack.len()..].as_ptr(); - let mut ptr = haystack[at..].as_ptr(); - while ptr < end { - if let Some(pre) = prefilter { - let at = ptr as usize - start as usize; - if prestate.is_effective(at) - && *state_id == self.start_state() - { - let c = prefilter::next(prestate, pre, haystack, at) - .into_option(); - match c { - None => return None, - Some(i) => { - ptr = start.offset(i as isize); - } + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; } } } - // SAFETY: next_state is safe for all possible u8 values, - // so the only thing we're concerned about is the validity - // of `state_id`. `state_id` either comes from the caller - // (in which case, we assert above that it is valid), or it - // comes from the return value of next_state, which is also - // guaranteed to be valid. - *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr); - ptr = ptr.offset(1); - // This routine always quits immediately after seeing a - // match, and since dead states can only come after seeing - // a match, seeing a dead state here is impossible. (Unless - // we have an anchored automaton, in which case, dead states - // are used to stop a search.) - debug_assert!( - *state_id != dead_id() || self.anchored(), - "standard find should never see a dead state" - ); + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. 
+ *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + // This routine always quits immediately after seeing a + // match, and since dead states can only come after seeing + // a match, seeing a dead state here is impossible. (Unless + // we have an anchored automaton, in which case, dead states + // are used to stop a search.) + debug_assert!( + *state_id != dead_id() || self.anchored(), + "standard find should never see a dead state" + ); - if self.is_match_or_dead_state(*state_id) { - return if *state_id == dead_id() { - None - } else { - let end = ptr as usize - start as usize; - self.get_match(*state_id, 0, end) - }; - } + if self.is_match_or_dead_state(*state_id) { + return if *state_id == dead_id() { + None + } else { + self.get_match(*state_id, 0, at) + }; } - None } + None } /// Execute a search using leftmost (either first or longest) match @@ -276,7 +287,7 @@ pub trait Automaton { } } - // It's important for this to always be inlined. Namely, it's only caller + // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. #[inline(always)] @@ -285,76 +296,58 @@ pub trait Automaton { prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], - at: usize, + mut at: usize, state_id: &mut Self::ID, ) -> Option<Match> { debug_assert!(self.match_kind().is_leftmost()); - // This is necessary for guaranteeing a safe API, since we use the - // state ID below in a function that exhibits UB if called with an - // invalid state ID. - assert!( - self.is_valid(*state_id), - "{} is not a valid state ID", - state_id.to_usize() - ); if self.anchored() && at > 0 && *state_id == self.start_state() { return None; } - unsafe { - let start = haystack.as_ptr(); - let end = haystack[haystack.len()..].as_ptr(); - let mut ptr = haystack[at..].as_ptr(); - - let mut last_match = self.get_match(*state_id, 0, at); - while ptr < end { - if let Some(pre) = prefilter { - let at = ptr as usize - start as usize; - if prestate.is_effective(at) - && *state_id == self.start_state() - { - let c = prefilter::next(prestate, pre, haystack, at) - .into_option(); - match c { - None => return None, - Some(i) => { - ptr = start.offset(i as isize); - } + let mut last_match = self.get_match(*state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; } } } - // SAFETY: next_state is safe for all possible u8 values, - // so the only thing we're concerned about is the validity - // of `state_id`. `state_id` either comes from the caller - // (in which case, we assert above that it is valid), or it - // comes from the return value of next_state, which is also - // guaranteed to be valid. - *state_id = self.next_state_unchecked_no_fail(*state_id, *ptr); - ptr = ptr.offset(1); - if self.is_match_or_dead_state(*state_id) { - if *state_id == dead_id() { - // The only way to enter into a dead state is if a - // match has been found, so we assert as much. This - // is different from normal automata, where you might - // enter a dead state if you know a subsequent match - // will never be found (regardless of whether a match - // has already been found). 
For Aho-Corasick, it is - // built so that we can match at any position, so the - // possibility of a match always exists. - // - // (Unless we have an anchored automaton, in which - // case, dead states are used to stop a search.) - debug_assert!( - last_match.is_some() || self.anchored(), - "failure state should only be seen after match" - ); - return last_match; - } - let end = ptr as usize - start as usize; - last_match = self.get_match(*state_id, 0, end); + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(*state_id) { + if *state_id == dead_id() { + // The only way to enter into a dead state is if a match + // has been found, so we assert as much. This is different + // from normal automata, where you might enter a dead state + // if you know a subsequent match will never be found + // (regardless of whether a match has already been found). + // For Aho-Corasick, it is built so that we can match at + // any position, so the possibility of a match always + // exists. + // + // (Unless we have an anchored automaton, in which case, + // dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "failure state should only be seen after match" + ); + return last_match; } + last_match = self.get_match(*state_id, 0, at); } - last_match } + last_match } /// This is like leftmost_find_at, but does not need to track a caller @@ -393,7 +386,7 @@ pub trait Automaton { } } - // It's important for this to always be inlined. Namely, it's only caller + // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at_no_state, and the inlining should remove the case // analysis for prefilter scanning when there is no prefilter available. #[inline(always)] @@ -402,7 +395,7 @@ pub trait Automaton { prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], - at: usize, + mut at: usize, ) -> Option<Match> { debug_assert!(self.match_kind().is_leftmost()); if self.anchored() && at > 0 { @@ -422,63 +415,54 @@ pub trait Automaton { }; } } - let mut state_id = self.start_state(); - unsafe { - let start = haystack.as_ptr(); - let end = haystack[haystack.len()..].as_ptr(); - let mut ptr = haystack[at..].as_ptr(); - let mut last_match = self.get_match(state_id, 0, at); - while ptr < end { - if let Some(pre) = prefilter { - let at = ptr as usize - start as usize; - if prestate.is_effective(at) - && state_id == self.start_state() - { - match prefilter::next(prestate, pre, haystack, at) { - Candidate::None => return None, - // Since we aren't tracking a state ID, we can - // quit early once we know we have a match. - Candidate::Match(m) => return Some(m), - Candidate::PossibleStartOfMatch(i) => { - ptr = start.offset(i as isize); - } + let mut state_id = self.start_state(); + let mut last_match = self.get_match(state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && state_id == self.start_state() + { + match prefilter::next(prestate, pre, haystack, at) { + Candidate::None => return None, + // Since we aren't tracking a state ID, we can + // quit early once we know we have a match. 
+ Candidate::Match(m) => return Some(m), + Candidate::PossibleStartOfMatch(i) => { + at = i; } } } - // SAFETY: next_state is safe for all possible u8 values, - // so the only thing we're concerned about is the validity - // of `state_id`. `state_id` either comes from the caller - // (in which case, we assert above that it is valid), or it - // comes from the return value of next_state, which is also - // guaranteed to be valid. - state_id = self.next_state_unchecked_no_fail(state_id, *ptr); - ptr = ptr.offset(1); - if self.is_match_or_dead_state(state_id) { - if state_id == dead_id() { - // The only way to enter into a dead state is if a - // match has been found, so we assert as much. This - // is different from normal automata, where you might - // enter a dead state if you know a subsequent match - // will never be found (regardless of whether a match - // has already been found). For Aho-Corasick, it is - // built so that we can match at any position, so the - // possibility of a match always exists. - // - // (Unless we have an anchored automaton, in which - // case, dead states are used to stop a search.) - debug_assert!( - last_match.is_some() || self.anchored(), - "failure state should only be seen after match" - ); - return last_match; - } - let end = ptr as usize - start as usize; - last_match = self.get_match(state_id, 0, end); + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + state_id = self.next_state_no_fail(state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(state_id) { + if state_id == dead_id() { + // The only way to enter into a dead state is if a + // match has been found, so we assert as much. This + // is different from normal automata, where you might + // enter a dead state if you know a subsequent match + // will never be found (regardless of whether a match + // has already been found). For Aho-Corasick, it is + // built so that we can match at any position, so the + // possibility of a match always exists. + // + // (Unless we have an anchored automaton, in which + // case, dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "failure state should only be seen after match" + ); + return last_match; } + last_match = self.get_match(state_id, 0, at); } - last_match } + last_match } /// Execute an overlapping search. diff --git a/third_party/rust/aho-corasick/src/buffer.rs b/third_party/rust/aho-corasick/src/buffer.rs index 01a8453739a6ffd34b2b1029d468be775c9b1da9..e7339eb2021947a6a9bdf2b7e7752533c71e5fa0 100644 --- a/third_party/rust/aho-corasick/src/buffer.rs +++ b/third_party/rust/aho-corasick/src/buffer.rs @@ -50,7 +50,9 @@ impl Buffer { // reasons, so we set a lower bound of `8 * min`. // // TODO: It would be good to find a way to test the streaming - // implementation with the minimal buffer size. + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); Buffer { buf: vec![0; capacity], min, end: 0 } } @@ -117,6 +119,8 @@ impl Buffer { // SAFETY: A buffer contains Copy data, so there's no problem // moving it around. 
Safety also depends on our indices being in // bounds, which they always should be, given the assert above. + // + // TODO: Switch to [T]::copy_within once our MSRV is high enough. ptr::copy( self.buf[roll_start..].as_ptr(), self.buf.as_mut_ptr(), diff --git a/third_party/rust/aho-corasick/src/classes.rs b/third_party/rust/aho-corasick/src/classes.rs index fe206685fb53f6c3aea7497909172ea40903c9f9..f84ae2104ee439ac39b0cee17e914d66886e71a4 100644 --- a/third_party/rust/aho-corasick/src/classes.rs +++ b/third_party/rust/aho-corasick/src/classes.rs @@ -36,7 +36,7 @@ impl ByteClasses { pub fn get(&self, byte: u8) -> u8 { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. - unsafe { *self.0.get_unchecked(byte as usize) } + self.0[byte as usize] } /// Return the total number of elements in the alphabet represented by @@ -64,7 +64,7 @@ impl ByteClasses { /// hasn't been converted to equivalence classes yet. Picking an arbitrary /// byte from each equivalence class then permits a full exploration of /// the NFA instead of using every possible byte value. - pub fn representatives(&self) -> ByteClassRepresentatives { + pub fn representatives(&self) -> ByteClassRepresentatives<'_> { ByteClassRepresentatives { classes: self, byte: 0, last_class: None } } @@ -85,7 +85,7 @@ impl ByteClasses { } impl fmt::Debug for ByteClasses { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.is_singleton() { write!(f, "ByteClasses({{singletons}})") } else { diff --git a/third_party/rust/aho-corasick/src/dfa.rs b/third_party/rust/aho-corasick/src/dfa.rs index 6eee336b324e3446f4f11d93d7db9eed3d954d2f..a03a254c193432cdd05b7a4760b08f8c350bb32e 100644 --- a/third_party/rust/aho-corasick/src/dfa.rs +++ b/third_party/rust/aho-corasick/src/dfa.rs @@ -1,13 +1,13 @@ use std::mem::size_of; -use ahocorasick::MatchKind; -use automaton::Automaton; -use classes::ByteClasses; -use error::Result; -use nfa::{PatternID, PatternLength, NFA}; -use prefilter::{Prefilter, PrefilterObj, PrefilterState}; -use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID}; -use Match; +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::ByteClasses; +use crate::error::Result; +use crate::nfa::{PatternID, PatternLength, NFA}; +use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState}; +use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID}; +use crate::Match; #[derive(Clone, Debug)] pub enum DFA<S> { @@ -43,6 +43,10 @@ impl<S: StateID> DFA<S> { self.repr().pattern_count } + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + pub fn start_state(&self) -> S { self.repr().start_id } @@ -189,9 +193,9 @@ impl<S: StateID> Automaton for Standard<S> { self.repr().match_count(id) } - unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() * 256 + input as usize; - *self.repr().trans.get_unchecked(o) + self.repr().trans[o] } } @@ -248,11 +252,11 @@ impl<S: StateID> Automaton for ByteClass<S> { self.repr().match_count(id) } - unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + fn next_state(&self, current: S, input: u8) -> S { let alphabet_len = self.repr().byte_classes.alphabet_len(); let input = self.repr().byte_classes.get(input); let o = current.to_usize() * alphabet_len + 
input as usize; - *self.repr().trans.get_unchecked(o) + self.repr().trans[o] } } @@ -317,9 +321,9 @@ impl<S: StateID> Automaton for Premultiplied<S> { self.repr().matches[o].len() } - unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() + input as usize; - *self.repr().trans.get_unchecked(o) + self.repr().trans[o] } } @@ -384,10 +388,10 @@ impl<S: StateID> Automaton for PremultipliedByteClass<S> { self.repr().matches[o].len() } - unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + fn next_state(&self, current: S, input: u8) -> S { let input = self.repr().byte_classes.get(input); let o = current.to_usize() + input as usize; - *self.repr().trans.get_unchecked(o) + self.repr().trans[o] } } @@ -637,8 +641,8 @@ impl Builder { heap_bytes: 0, prefilter: nfa.prefilter_obj().map(|p| p.clone()), byte_classes: byte_classes.clone(), - trans: trans, - matches: matches, + trans, + matches, }; for id in (0..nfa.state_len()).map(S::from_usize) { repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id)); diff --git a/third_party/rust/aho-corasick/src/error.rs b/third_party/rust/aho-corasick/src/error.rs index 7dace63ca7cc5ca88d03669419ae2e8925d98a2b..a57a777945b2ccbc9a19f366c7022ba3b495e1e9 100644 --- a/third_party/rust/aho-corasick/src/error.rs +++ b/third_party/rust/aho-corasick/src/error.rs @@ -68,7 +68,7 @@ impl error::Error for Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.kind { ErrorKind::StateIDOverflow { max } => write!( f, diff --git a/third_party/rust/aho-corasick/src/lib.rs b/third_party/rust/aho-corasick/src/lib.rs index cbaf945a1acece7e26605ec365b328a0bbe252cb..9a3d084783b4fa0cc9954a1256db8d4fd693410c 100644 --- a/third_party/rust/aho-corasick/src/lib.rs +++ b/third_party/rust/aho-corasick/src/lib.rs @@ -168,13 +168,14 @@ naive solutions, it is generally slower than more specialized algorithms that are accelerated using vector instructions such as SIMD. For that reason, this library will internally use a "prefilter" to attempt -to accelerate searches when possible. Currently, this library has fairly -limited implementation that only applies when there are 3 or fewer unique -starting bytes among all patterns in an automaton. - -While a prefilter is generally good to have on by default since it works well -in the common case, it can lead to less predictable or even sub-optimal -performance in some cases. For that reason, prefilters can be disabled via +to accelerate searches when possible. Currently, this library has several +different algorithms it might use depending on the patterns provided. Once the +number of patterns gets too big, prefilters are no longer used. + +While a prefilter is generally good to have on by default since it works +well in the common case, it can lead to less predictable or even sub-optimal +performance in some cases. For that reason, prefilters can be explicitly +disabled via [`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter). */ @@ -185,20 +186,19 @@ performance in some cases. 
For that reason, prefilters can be disabled via #[cfg(not(feature = "std"))] compile_error!("`std` feature is currently required to build this crate"); -extern crate memchr; -#[cfg(test)] -#[macro_use] -extern crate doc_comment; +// #[cfg(doctest)] +// #[macro_use] +// extern crate doc_comment; -#[cfg(test)] -doctest!("../README.md"); +// #[cfg(doctest)] +// doctest!("../README.md"); -pub use ahocorasick::{ +pub use crate::ahocorasick::{ AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind, StreamFindIter, }; -pub use error::{Error, ErrorKind}; -pub use state_id::StateID; +pub use crate::error::{Error, ErrorKind}; +pub use crate::state_id::StateID; mod ahocorasick; mod automaton; @@ -292,6 +292,6 @@ impl Match { #[inline] fn from_span(id: usize, start: usize, end: usize) -> Match { - Match { pattern: id, len: end - start, end: end } + Match { pattern: id, len: end - start, end } } } diff --git a/third_party/rust/aho-corasick/src/nfa.rs b/third_party/rust/aho-corasick/src/nfa.rs index 1e8fe3938694de890961214cf194f30a53c064c5..e29bb27f9674fc90243026739a69ae57208db826 100644 --- a/third_party/rust/aho-corasick/src/nfa.rs +++ b/third_party/rust/aho-corasick/src/nfa.rs @@ -4,13 +4,13 @@ use std::fmt; use std::mem::size_of; use std::ops::{Index, IndexMut}; -use ahocorasick::MatchKind; -use automaton::Automaton; -use classes::{ByteClassBuilder, ByteClasses}; -use error::Result; -use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; -use state_id::{dead_id, fail_id, usize_to_state_id, StateID}; -use Match; +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::{ByteClassBuilder, ByteClasses}; +use crate::error::Result; +use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; +use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID}; +use crate::Match; /// The identifier for a pattern, which is simply the position of the pattern /// in the sequence of patterns given by the caller. @@ -172,7 +172,7 @@ impl<S: StateID> NFA<S> { self.state_mut(id) } - fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<S> { + fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> { IterTransitionsMut::new(self, id) } @@ -194,7 +194,7 @@ impl<S: StateID> NFA<S> { trans, // Anchored automatons do not have any failure transitions. fail: if self.anchored { dead_id() } else { self.start_id }, - depth: depth, + depth, matches: vec![], }); Ok(id) @@ -207,7 +207,7 @@ impl<S: StateID> NFA<S> { trans, // Anchored automatons do not have any failure transitions. fail: if self.anchored { dead_id() } else { self.start_id }, - depth: depth, + depth, matches: vec![], }); Ok(id) @@ -262,14 +262,14 @@ impl<S: StateID> Automaton for NFA<S> { self.states[id.to_usize()].matches.len() } - unsafe fn next_state_unchecked(&self, mut current: S, input: u8) -> S { + fn next_state(&self, mut current: S, input: u8) -> S { // This terminates since: // // 1. `State.fail` never points to fail_id(). // 2. All `State.fail` values point to a state closer to `start`. // 3. The start state has no transitions to fail_id(). loop { - let state = self.states.get_unchecked(current.to_usize()); + let state = &self.states[current.to_usize()]; let next = state.next_state(input); if next != fail_id() { return next; @@ -335,9 +335,9 @@ impl<S: StateID> State<S> { /// Represents the transitions for a single dense state. /// -/// The primary purpose here is to encapsulate unchecked index access. 
Namely, -/// since a dense representation always contains 256 elements, all values of -/// `u8` are valid indices. +/// The primary purpose here is to encapsulate index access. Namely, since a +/// dense representation always contains 256 elements, all values of `u8` are +/// valid indices. #[derive(Clone, Debug)] struct Dense<S>(Vec<S>); @@ -362,7 +362,7 @@ impl<S> Index<u8> for Dense<S> { fn index(&self, i: u8) -> &S { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. - unsafe { self.0.get_unchecked(i as usize) } + &self.0[i as usize] } } @@ -371,7 +371,7 @@ impl<S> IndexMut<u8> for Dense<S> { fn index_mut(&mut self, i: u8) -> &mut S { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. - unsafe { self.0.get_unchecked_mut(i as usize) } + &mut self.0[i as usize] } } @@ -497,7 +497,7 @@ impl<S: StateID> Transitions<S> { /// is iterating over transitions, the caller can still mutate the NFA. This /// is useful when creating failure transitions. #[derive(Debug)] -struct IterTransitionsMut<'a, S: StateID + 'a> { +struct IterTransitionsMut<'a, S: StateID> { nfa: &'a mut NFA<S>, state_id: S, cur: usize, @@ -619,7 +619,7 @@ struct Compiler<'a, S: StateID> { impl<'a, S: StateID> Compiler<'a, S> { fn new(builder: &'a Builder) -> Result<Compiler<'a, S>> { Ok(Compiler { - builder: builder, + builder, prefilter: prefilter::Builder::new(builder.match_kind) .ascii_case_insensitive(builder.ascii_case_insensitive), nfa: NFA { @@ -702,6 +702,10 @@ impl<'a, S: StateID> Compiler<'a, S> { // building a DFA. They would technically be useful for the // NFA, but it would require a second pass over the patterns. self.byte_classes.set_range(b, b); + if self.builder.ascii_case_insensitive { + let b = opposite_ascii_case(b); + self.byte_classes.set_range(b, b); + } // If the transition from prev using the current byte already // exists, then just move through it. Otherwise, add a new @@ -854,10 +858,17 @@ impl<'a, S: StateID> Compiler<'a, S> { while let Some(id) = queue.pop_front() { let mut it = self.nfa.iter_transitions_mut(id); while let Some((b, next)) = it.next() { - if !seen.contains(next) { - queue.push_back(next); - seen.insert(next); + if seen.contains(next) { + // The only way to visit a duplicate state in a transition + // list is when ASCII case insensitivity is enabled. In + // this case, we want to skip it since it's redundant work. + // But it would also end up duplicating matches, which + // results in reporting duplicate matches in some cases. + // See the 'acasei010' regression test. + continue; } + queue.push_back(next); + seen.insert(next); let mut fail = it.nfa().state(id).fail; while it.nfa().state(fail).next_state(b) == fail_id() { @@ -1008,10 +1019,17 @@ impl<'a, S: StateID> Compiler<'a, S> { // Queue up the next state. let next = item.next_queued_state(it.nfa(), next_id); - if !seen.contains(next.id) { - queue.push_back(next); - seen.insert(next.id); + if seen.contains(next.id) { + // The only way to visit a duplicate state in a transition + // list is when ASCII case insensitivity is enabled. In + // this case, we want to skip it since it's redundant work. + // But it would also end up duplicating matches, which + // results in reporting duplicate matches in some cases. + // See the 'acasei010' regression test. + continue; } + queue.push_back(next); + seen.insert(next.id); // Find the failure state for next. Same as standard. 
let mut fail = it.nfa().state(item.id).fail; @@ -1256,9 +1274,10 @@ impl Iterator for AllBytesIter { } impl<S: StateID> fmt::Debug for NFA<S> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "NFA(")?; writeln!(f, "match_kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter)?; writeln!(f, "{}", "-".repeat(79))?; for (id, s) in self.states.iter().enumerate() { let mut trans = vec![]; diff --git a/third_party/rust/aho-corasick/src/packed/api.rs b/third_party/rust/aho-corasick/src/packed/api.rs index 1740643a2c93d780f652aaaf77e01b76e49328f9..c15ae3ffa73920f88008274fb6430cc4a76b4478 100644 --- a/third_party/rust/aho-corasick/src/packed/api.rs +++ b/third_party/rust/aho-corasick/src/packed/api.rs @@ -1,9 +1,9 @@ use std::u16; -use packed::pattern::Patterns; -use packed::rabinkarp::RabinKarp; -use packed::teddy::{self, Teddy}; -use Match; +use crate::packed::pattern::Patterns; +use crate::packed::rabinkarp::RabinKarp; +use crate::packed::teddy::{self, Teddy}; +use crate::Match; /// This is a limit placed on the total number of patterns we're willing to try /// and match at once. As more sophisticated algorithms are added, this number @@ -269,8 +269,8 @@ impl Builder { }; Some(Searcher { config: self.config.clone(), - patterns: patterns, - rabinkarp: rabinkarp, + patterns, + rabinkarp, search_kind, minimum_len, }) diff --git a/third_party/rust/aho-corasick/src/packed/mod.rs b/third_party/rust/aho-corasick/src/packed/mod.rs index 5a3aa2e21abb8441f5774136812f475ef8eb2009..25a7966a058f2d0ec986b498d60748c7497b47ee 100644 --- a/third_party/rust/aho-corasick/src/packed/mod.rs +++ b/third_party/rust/aho-corasick/src/packed/mod.rs @@ -105,7 +105,7 @@ common reasons: no searcher is built. */ -pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; +pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; mod api; mod pattern; diff --git a/third_party/rust/aho-corasick/src/packed/pattern.rs b/third_party/rust/aho-corasick/src/packed/pattern.rs index dfb07e98c51e9f313853d7a84c027895e450c95a..f4c6756fcb620b197eee3460c74dc40003a880f3 100644 --- a/third_party/rust/aho-corasick/src/packed/pattern.rs +++ b/third_party/rust/aho-corasick/src/packed/pattern.rs @@ -4,7 +4,7 @@ use std::mem; use std::u16; use std::usize; -use packed::api::MatchKind; +use crate::packed::api::MatchKind; /// The type used for representing a pattern identifier. /// @@ -155,7 +155,7 @@ impl Patterns { /// Return the pattern with the given identifier. If such a pattern does /// not exist, then this panics. - pub fn get(&self, id: PatternID) -> Pattern { + pub fn get(&self, id: PatternID) -> Pattern<'_> { Pattern(&self.by_id[id as usize]) } @@ -167,7 +167,7 @@ impl Patterns { /// Callers must ensure that a pattern with the given identifier exists /// before using this method. #[cfg(target_arch = "x86_64")] - pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern { + pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> { Pattern(self.by_id.get_unchecked(id as usize)) } @@ -189,7 +189,7 @@ impl Patterns { /// the order provided by this iterator, then the result is guaranteed /// to satisfy the correct match semantics. (Either leftmost-first or /// leftmost-longest.) 
- pub fn iter(&self) -> PatternIter { + pub fn iter(&self) -> PatternIter<'_> { PatternIter { patterns: self, i: 0 } } } @@ -226,7 +226,7 @@ impl<'p> Iterator for PatternIter<'p> { pub struct Pattern<'a>(&'a [u8]); impl<'a> fmt::Debug for Pattern<'a> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Pattern") .field("lit", &String::from_utf8_lossy(&self.0)) .finish() diff --git a/third_party/rust/aho-corasick/src/packed/rabinkarp.rs b/third_party/rust/aho-corasick/src/packed/rabinkarp.rs index 3992296294c741283240d457ae4894e12c118824..fa6b1e312302cc8cc58b630bc3f28f76374c4a14 100644 --- a/third_party/rust/aho-corasick/src/packed/rabinkarp.rs +++ b/third_party/rust/aho-corasick/src/packed/rabinkarp.rs @@ -1,7 +1,7 @@ use std::mem; -use packed::pattern::{PatternID, Patterns}; -use Match; +use crate::packed::pattern::{PatternID, Patterns}; +use crate::Match; /// The type of the rolling hash used in the Rabin-Karp algorithm. type Hash = usize; diff --git a/third_party/rust/aho-corasick/src/packed/teddy/compile.rs b/third_party/rust/aho-corasick/src/packed/teddy/compile.rs index bd5e971b3a919d4b7d18a81cbeef7f82a477b06d..741cb6923cfb521dab61d8bc34802f86dcb6e941 100644 --- a/third_party/rust/aho-corasick/src/packed/teddy/compile.rs +++ b/third_party/rust/aho-corasick/src/packed/teddy/compile.rs @@ -4,8 +4,8 @@ use std::cmp; use std::collections::BTreeMap; use std::fmt; -use packed::pattern::{PatternID, Patterns}; -use packed::teddy::Teddy; +use crate::packed::pattern::{PatternID, Patterns}; +use crate::packed::teddy::Teddy; /// A builder for constructing a Teddy matcher. /// @@ -73,7 +73,7 @@ impl Builder { } fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> { - use packed::teddy::runtime; + use crate::packed::teddy::runtime; // Most of the logic here is just about selecting the optimal settings, // or perhaps even rejecting construction altogether. The choices @@ -119,7 +119,7 @@ impl Builder { // safe to call functions marked with the `avx2` target feature. 
match (masks.len(), avx, fat) { (1, false, _) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask128( runtime::TeddySlim1Mask128 { @@ -128,7 +128,7 @@ impl Builder { ), }), (1, true, false) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask256( runtime::TeddySlim1Mask256 { @@ -137,7 +137,7 @@ impl Builder { ), }), (1, true, true) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat1Mask256( runtime::TeddyFat1Mask256 { @@ -146,7 +146,7 @@ impl Builder { ), }), (2, false, _) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask128( runtime::TeddySlim2Mask128 { @@ -156,7 +156,7 @@ impl Builder { ), }), (2, true, false) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask256( runtime::TeddySlim2Mask256 { @@ -166,7 +166,7 @@ impl Builder { ), }), (2, true, true) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat2Mask256( runtime::TeddyFat2Mask256 { @@ -176,7 +176,7 @@ impl Builder { ), }), (3, false, _) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask128( runtime::TeddySlim3Mask128 { @@ -187,7 +187,7 @@ impl Builder { ), }), (3, true, false) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask256( runtime::TeddySlim3Mask256 { @@ -198,7 +198,7 @@ impl Builder { ), }), (3, true, true) => Some(Teddy { - buckets: buckets, + buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat3Mask256( runtime::TeddyFat3Mask256 { @@ -296,7 +296,7 @@ impl<'p> Compiler<'p> { } impl<'p> fmt::Debug for Compiler<'p> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut buckets = vec![vec![]; self.buckets.len()]; for (i, bucket) in self.buckets.iter().enumerate() { for &patid in bucket { @@ -400,7 +400,7 @@ impl Mask { } impl fmt::Debug for Mask { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let (mut parts_lo, mut parts_hi) = (vec![], vec![]); for i in 0..32 { parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); diff --git a/third_party/rust/aho-corasick/src/packed/teddy/mod.rs b/third_party/rust/aho-corasick/src/packed/teddy/mod.rs index b896023a47a8ae1e86cd06c6aac50ecd37e38d2d..3268cdfad269c76d7bac381200351fb3dbcf5943 100644 --- a/third_party/rust/aho-corasick/src/packed/teddy/mod.rs +++ b/third_party/rust/aho-corasick/src/packed/teddy/mod.rs @@ -1,11 +1,11 @@ #[cfg(target_arch = "x86_64")] -pub use packed::teddy::compile::Builder; +pub use crate::packed::teddy::compile::Builder; #[cfg(not(target_arch = "x86_64"))] -pub use packed::teddy::fallback::Builder; +pub use crate::packed::teddy::fallback::Builder; #[cfg(not(target_arch = "x86_64"))] -pub use packed::teddy::fallback::Teddy; +pub use crate::packed::teddy::fallback::Teddy; #[cfg(target_arch = "x86_64")] -pub use packed::teddy::runtime::Teddy; +pub use crate::packed::teddy::runtime::Teddy; #[cfg(target_arch = "x86_64")] mod compile; @@ -14,8 +14,8 @@ mod runtime; #[cfg(not(target_arch = "x86_64"))] mod fallback { - use 
packed::pattern::Patterns; - use Match; + use crate::packed::pattern::Patterns; + use crate::Match; #[derive(Clone, Debug, Default)] pub struct Builder(()); diff --git a/third_party/rust/aho-corasick/src/packed/teddy/runtime.rs b/third_party/rust/aho-corasick/src/packed/teddy/runtime.rs index a73694805c27370408bd9e088d1ea3dabf7b9b0c..0d96913701d8b7623f3a197bb2fdd9fac9641858 100644 --- a/third_party/rust/aho-corasick/src/packed/teddy/runtime.rs +++ b/third_party/rust/aho-corasick/src/packed/teddy/runtime.rs @@ -51,10 +51,10 @@ use std::arch::x86_64::*; use std::mem; -use packed::pattern::{PatternID, Patterns}; -use packed::teddy::compile; -use packed::vector::*; -use Match; +use crate::packed::pattern::{PatternID, Patterns}; +use crate::packed::teddy::compile; +use crate::packed::vector::*; +use crate::Match; /// The Teddy runtime. /// diff --git a/third_party/rust/aho-corasick/src/packed/tests.rs b/third_party/rust/aho-corasick/src/packed/tests.rs index a3843966f9152eef8fe35ca21632bfd35c51d032..91410cb02c6e5a0f12875d7c960ba28771aaa5d3 100644 --- a/third_party/rust/aho-corasick/src/packed/tests.rs +++ b/third_party/rust/aho-corasick/src/packed/tests.rs @@ -1,8 +1,8 @@ use std::collections::HashMap; use std::usize; -use packed::{Config, MatchKind}; -use Match; +use crate::packed::{Config, MatchKind}; +use crate::Match; /// A description of a single test against a multi-pattern searcher. /// diff --git a/third_party/rust/aho-corasick/src/prefilter.rs b/third_party/rust/aho-corasick/src/prefilter.rs index 21b565888ef442b469e1c39bf4c78fa01433fc81..ef814117cc6db0d23ef47c0db60815c8006fa42e 100644 --- a/third_party/rust/aho-corasick/src/prefilter.rs +++ b/third_party/rust/aho-corasick/src/prefilter.rs @@ -5,9 +5,9 @@ use std::u8; use memchr::{memchr, memchr2, memchr3}; -use ahocorasick::MatchKind; -use packed; -use Match; +use crate::ahocorasick::MatchKind; +use crate::packed; +use crate::Match; /// A candidate is the result of running a prefilter on a haystack at a /// particular position. The result is either no match, a confirmed match or @@ -80,6 +80,17 @@ pub trait Prefilter: fn reports_false_positives(&self) -> bool { true } + + /// Returns true if and only if this prefilter may look for a non-starting + /// position of a match. + /// + /// This is useful in a streaming context where prefilters that don't look + /// for a starting position of a match can be quite difficult to deal with. + /// + /// This returns false by default. + fn looks_for_non_start_of_match(&self) -> bool { + false + } } impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { @@ -191,6 +202,17 @@ impl PrefilterState { } } + /// Create a prefilter state that always disables the prefilter. + pub fn disabled() -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len: 0, + inert: true, + last_scan_at: 0, + } + } + /// Update this state with the number of bytes skipped on the last /// invocation of the prefilter. #[inline] @@ -285,6 +307,7 @@ impl Builder { /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. 
pub fn build(&self) -> Option<PrefilterObj> { + // match (self.start_bytes.build(), self.rare_bytes.build()) { match (self.start_bytes.build(), self.rare_bytes.build()) { // If we could build both start and rare prefilters, then there are // a few cases in which we'd want to use the start-byte prefilter @@ -371,8 +394,14 @@ struct RareBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, - /// A set of byte offsets associated with detected rare bytes. An entry is - /// only set if a rare byte is detected in a pattern. + /// A set of rare bytes, indexed by byte value. + rare_set: ByteSet, + /// A set of byte offsets associated with bytes in a pattern. An entry + /// corresponds to a particular byte (its index) and is only non-zero if + /// the byte occurred at an offset greater than 0 in at least one pattern. + /// + /// If a byte's offset is not representable in 8 bits, then the rare bytes + /// prefilter becomes inert. byte_offsets: RareByteOffsets, /// Whether this is available as a prefilter or not. This can be set to /// false during construction if a condition is seen that invalidates the @@ -385,11 +414,43 @@ struct RareBytesBuilder { rank_sum: u16, } -/// A set of rare byte offsets, keyed by byte. +/// A set of bytes. +#[derive(Clone, Copy)] +struct ByteSet([bool; 256]); + +impl ByteSet { + fn empty() -> ByteSet { + ByteSet([false; 256]) + } + + fn insert(&mut self, b: u8) -> bool { + let new = !self.contains(b); + self.0[b as usize] = true; + new + } + + fn contains(&self, b: u8) -> bool { + self.0[b as usize] + } +} + +impl fmt::Debug for ByteSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut bytes = vec![]; + for b in 0..=255 { + if self.contains(b) { + bytes.push(b); + } + } + f.debug_struct("ByteSet").field("set", &bytes).finish() + } +} + +/// A set of byte offsets, keyed by byte. #[derive(Clone, Copy)] struct RareByteOffsets { - /// When an item in this set has an offset of u8::MAX (255), then it is - /// considered unset. + /// Each entry corresponds to the maximum offset of the corresponding + /// byte across all patterns seen. set: [RareByteOffset; 256], } @@ -403,29 +464,17 @@ impl RareByteOffsets { /// greater than the existing offset, then it overwrites the previous /// value and returns false. If there is no previous value set, then this /// sets it and returns true. - /// - /// The given offset must be active, otherwise this panics. - pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool { - assert!(off.is_active()); - - let existing = &mut self.set[byte as usize]; - if !existing.is_active() { - *existing = off; - true - } else { - if existing.max < off.max { - *existing = off; - } - false - } + pub fn set(&mut self, byte: u8, off: RareByteOffset) { + self.set[byte as usize].max = + cmp::max(self.set[byte as usize].max, off.max); } } impl fmt::Debug for RareByteOffsets { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut offsets = vec![]; for off in self.set.iter() { - if off.is_active() { + if off.max > 0 { offsets.push(off); } } @@ -448,34 +497,28 @@ struct RareByteOffset { /// ineffective when it is asked to start scanning from a position that it /// has already scanned past. /// - /// N.B. The maximum value for this is 254. A value of 255 indicates that - /// this is unused.
If a rare byte is found at an offset of 255 or greater, - /// then the rare-byte prefilter is disabled for simplicity. + /// Using a `u8` here means that if we ever see a pattern that's longer + /// than 255 bytes, then the entire rare byte prefilter is disabled. max: u8, } impl Default for RareByteOffset { fn default() -> RareByteOffset { - RareByteOffset { max: u8::MAX } + RareByteOffset { max: 0 } } } impl RareByteOffset { /// Create a new rare byte offset. If the given offset is too big, then - /// an inactive `RareByteOffset` is returned. - fn new(max: usize) -> RareByteOffset { - if max > (u8::MAX - 1) as usize { - RareByteOffset::default() + /// None is returned. In that case, callers should render the rare bytes + /// prefilter inert. + fn new(max: usize) -> Option<RareByteOffset> { + if max > u8::MAX as usize { + None } else { - RareByteOffset { max: max as u8 } + Some(RareByteOffset { max: max as u8 }) } } - - /// Returns true if and only if this offset is active. If it's inactive, - /// then it should not be used. - fn is_active(&self) -> bool { - self.max < u8::MAX - } } impl RareBytesBuilder { @@ -483,6 +526,7 @@ impl RareBytesBuilder { fn new() -> RareBytesBuilder { RareBytesBuilder { ascii_case_insensitive: false, + rare_set: ByteSet::empty(), byte_offsets: RareByteOffsets::empty(), available: true, count: 0, @@ -507,8 +551,8 @@ impl RareBytesBuilder { return None; } let (mut bytes, mut len) = ([0; 3], 0); - for b in 0..256 { - if self.byte_offsets.set[b].is_active() { + for b in 0..=255 { + if self.rare_set.contains(b) { bytes[len] = b as u8; len += 1; } @@ -539,15 +583,25 @@ impl RareBytesBuilder { /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { + // If we've already given up, then do nothing. + if !self.available { + return; + } // If we've already blown our budget, then don't waste time looking // for more rare bytes. if self.count > 3 { self.available = false; return; } + // If the pattern is too long, then our offset table is bunk, so + // give up. + if bytes.len() >= 256 { + self.available = false; + return; + } let mut rarest = match bytes.get(0) { None => return, - Some(&b) => (b, 0, freq_rank(b)), + Some(&b) => (b, freq_rank(b)), }; // The idea here is to look for the rarest byte in each pattern, and // add that to our set. As a special exception, if we see a byte that @@ -558,33 +612,44 @@ impl RareBytesBuilder { // were searching for `Sherlock` and `lockjaw`, then this would pick // `k` for both patterns, resulting in the use of `memchr` instead of // `memchr2` for `k` and `j`. + let mut found = false; for (pos, &b) in bytes.iter().enumerate() { - if self.byte_offsets.set[b as usize].is_active() { - self.add_rare_byte(b, pos); - return; + self.set_offset(pos, b); + if found { + continue; + } + if self.rare_set.contains(b) { + found = true; + continue; } let rank = freq_rank(b); - if rank < rarest.2 { - rarest = (b, pos, rank); + if rank < rarest.1 { + rarest = (b, rank); } } - self.add_rare_byte(rarest.0, rarest.1); + if !found { + self.add_rare_byte(rarest.0); + } } - fn add_rare_byte(&mut self, byte: u8, pos: usize) { - self.add_one_byte(byte, pos); + fn set_offset(&mut self, pos: usize, byte: u8) { + // This unwrap is OK because pos is never bigger than our max. 
+ let offset = RareByteOffset::new(pos).unwrap(); + self.byte_offsets.set(byte, offset); if self.ascii_case_insensitive { - self.add_one_byte(opposite_ascii_case(byte), pos); + self.byte_offsets.set(opposite_ascii_case(byte), offset); } } - fn add_one_byte(&mut self, byte: u8, pos: usize) { - let off = RareByteOffset::new(pos); - if !off.is_active() { - self.available = false; - return; + fn add_rare_byte(&mut self, byte: u8) { + self.add_one_rare_byte(byte); + if self.ascii_case_insensitive { + self.add_one_rare_byte(opposite_ascii_case(byte)); } - if self.byte_offsets.apply(byte, off) { + } + + fn add_one_rare_byte(&mut self, byte: u8) { + if self.rare_set.insert(byte) { self.count += 1; self.rank_sum += freq_rank(byte) as u16; } @@ -621,6 +686,33 @@ impl Prefilter for RareBytesOne { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: It should be possible to use a rare byte prefilter in a + // streaming context. The main problem is that we usually assume that + // if a prefilter has scanned some text and not found anything, then no + // match *starts* in that text. This doesn't matter in non-streaming + // contexts, but in a streaming context, if we're looking for a byte + // that doesn't start at the beginning of a match and don't find it, + // then it's still possible for a match to start at the end of the + // current buffer content. In order to fix this, the streaming searcher + // would need to become aware of prefilters that do this and use the + // appropriate offset in various places. It is quite a delicate change + // and probably shouldn't be attempted until streaming search has a + // better testing strategy. In particular, we'd really like to be able + // to vary the buffer size to force strange cases that occur at the + // edge of the buffer. If we make the buffer size minimal, then these + // cases occur more frequently and are easier to hit. + // + // This is also a bummer because this means that if the prefilter + // builder chose a rare byte prefilter, then a streaming search won't + // use any prefilter at all because the builder doesn't know how it's + // going to be used. Assuming we don't make streaming search aware of + // these special types of prefilters as described above, we could fix + // this by building a "backup" prefilter that could be used when the + // rare byte prefilter could not. But that's a band-aid. Sigh. + true + } } /// A prefilter for scanning for two "rare" bytes. @@ -655,6 +747,11 @@ impl Prefilter for RareBytesTwo { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } } /// A prefilter for scanning for three "rare" bytes. @@ -690,6 +787,11 @@ impl Prefilter for RareBytesThree { fn heap_bytes(&self) -> usize { 0 } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } } /// A builder for constructing a starting byte prefilter. /// @@ -698,7 +800,7 @@ impl Prefilter for RareBytesThree { /// matches by reporting all positions corresponding to a particular byte. This /// generally only takes effect when there are at most 3 distinct possible /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two -/// distinct starting bytes (`f` and `b`), and this prefiler returns all +/// distinct starting bytes (`f` and `b`), and this prefilter returns all /// occurrences of either `f` or `b`.
/// /// In some cases, a heuristic frequency analysis may determine that it would @@ -930,7 +1032,7 @@ pub fn opposite_ascii_case(b: u8) -> u8 { /// Return the frequency rank of the given byte. The higher the rank, the more /// common the byte (heuristically speaking). fn freq_rank(b: u8) -> u8 { - use byte_frequencies::BYTE_FREQUENCIES; + use crate::byte_frequencies::BYTE_FREQUENCIES; BYTE_FREQUENCIES[b as usize] } diff --git a/third_party/rust/aho-corasick/src/state_id.rs b/third_party/rust/aho-corasick/src/state_id.rs index 8ee58c6a3cd7f437675ef2a565abde6e5e9732ab..8973806b29c77a2cb852a3b3ed2d5022a06086b4 100644 --- a/third_party/rust/aho-corasick/src/state_id.rs +++ b/third_party/rust/aho-corasick/src/state_id.rs @@ -1,7 +1,7 @@ use std::fmt::Debug; use std::hash::Hash; -use error::{Error, Result}; +use crate::error::{Error, Result}; // NOTE: Most of this code was copied from regex-automata, but without the // (de)serialization specific stuff. @@ -69,18 +69,7 @@ mod private { /// other type. In particular, this crate provides implementations for `u8`, /// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for /// targets that can represent all corresponding values in a `usize`.) -/// -/// # Safety -/// -/// This trait is unsafe because the correctness of its implementations may be -/// relied upon by other unsafe code. For example, one possible way to -/// implement this trait incorrectly would be to return a maximum identifier -/// in `max_id` that is greater than the real maximum identifier. This will -/// likely result in wrap-on-overflow semantics in release mode, which can in -/// turn produce incorrect state identifiers. Those state identifiers may then -/// in turn access out-of-bounds memory in an automaton's search routine, where -/// bounds checks are explicitly elided for performance reasons. -pub unsafe trait StateID: +pub trait StateID: private::Sealed + Clone + Copy @@ -111,11 +100,11 @@ pub unsafe trait StateID: /// Return the maximum state identifier supported by this representation. /// /// Implementors must return a correct bound. Doing otherwise may result - /// in memory unsafety. + /// in unspecified behavior (but will not violate memory safety). 
fn max_id() -> usize; } -unsafe impl StateID for usize { +impl StateID for usize { #[inline] fn from_usize(n: usize) -> usize { n @@ -132,7 +121,7 @@ unsafe impl StateID for usize { } } -unsafe impl StateID for u8 { +impl StateID for u8 { #[inline] fn from_usize(n: usize) -> u8 { n as u8 @@ -149,7 +138,7 @@ unsafe impl StateID for u8 { } } -unsafe impl StateID for u16 { +impl StateID for u16 { #[inline] fn from_usize(n: usize) -> u16 { n as u16 @@ -167,7 +156,7 @@ unsafe impl StateID for u16 { } #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] -unsafe impl StateID for u32 { +impl StateID for u32 { #[inline] fn from_usize(n: usize) -> u32 { n as u32 @@ -185,7 +174,7 @@ unsafe impl StateID for u32 { } #[cfg(target_pointer_width = "64")] -unsafe impl StateID for u64 { +impl StateID for u64 { #[inline] fn from_usize(n: usize) -> u64 { n as u64 diff --git a/third_party/rust/aho-corasick/src/tests.rs b/third_party/rust/aho-corasick/src/tests.rs index a181bac86f0661ba108ed291a50af09eb401ff6e..25c0d5f4b97695bb7d5c9f97e15597e507beb1f3 100644 --- a/third_party/rust/aho-corasick/src/tests.rs +++ b/third_party/rust/aho-corasick/src/tests.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::io; use std::usize; -use {AhoCorasickBuilder, Match, MatchKind}; +use crate::{AhoCorasickBuilder, Match, MatchKind}; /// A description of a single test against an Aho-Corasick automaton. /// @@ -549,6 +549,39 @@ const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[ t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]), ]; +/// Tests for ASCII case insensitivity. +/// +/// These tests should all have the same behavior regardless of match semantics +/// or whether the search is overlapping. +const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[ + t!(acasei000, &["a"], "A", &[(0, 0, 1)]), + t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]), + t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]), + t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests. +const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]), + t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]), + t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests. +const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + // This is a regression test from: + // https://github.com/BurntSushi/aho-corasick/issues/68 + // Previously, it was reporting a duplicate (1, 3, 6) match. + t!( + acasei010, + &["abc", "def", "abcdef"], + "abcdef", + &[(0, 0, 3), (2, 0, 6), (1, 3, 6)] + ), +]; + /// Regression tests that are applied to all Aho-Corasick combinations. /// /// If regression tests are needed for specific match semantics, then add them @@ -706,6 +739,8 @@ macro_rules! testcombo { $collection, $kind, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] b.dfa(true).byte_classes(false); } ); @@ -714,6 +749,8 @@ macro_rules! testcombo { $collection, $kind, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] b.dfa(true).premultiply(false); } ); @@ -722,6 +759,8 @@ macro_rules! 
testcombo { $collection, $kind, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] b.dfa(true).byte_classes(false).premultiply(false); } ); @@ -797,6 +836,8 @@ testconfig!( AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] b.dfa(true).byte_classes(false); } ); @@ -806,6 +847,8 @@ testconfig!( AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] b.dfa(true).premultiply(false); } ); @@ -815,6 +858,8 @@ testconfig!( AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] b.dfa(true).byte_classes(false).premultiply(false); } ); @@ -907,6 +952,99 @@ testconfig!( } ); +// And also write out the test combinations for ASCII case insensitivity. +testconfig!( + acasei_standard_nfa_default, + &[ASCII_CASE_INSENSITIVE], + Standard, + |b: &mut AhoCorasickBuilder| { + b.prefilter(false).ascii_case_insensitive(true); + } +); +testconfig!( + acasei_standard_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + acasei_leftmost_first_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + acasei_leftmost_longest_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); + +fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>( + which: TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec<Match>| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| (m.pattern(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for test in tests { + assert_eq!( + test.matches, + get_match_triples(f(&test)).as_slice(), + "test: {}, patterns: {:?}, haystack: {:?}", + test.name, + test.patterns, + test.haystack + ); + } + } +} + #[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { @@ -996,27 +1134,119 @@ fn regression_ascii_case_insensitive_no_exponential() { assert!(ac.find("").is_none()); } -fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>( - which: TestCollection, - mut f: F, -) { - let get_match_triples = - |matches: Vec<Match>| -> Vec<(usize, 
usize, usize)> { - matches - .into_iter() - .map(|m| (m.pattern(), m.start(), m.end())) - .collect() - }; - for &tests in which { - for test in tests { +// See: https://github.com/BurntSushi/aho-corasick/issues/53 +// +// This test ensures that the rare byte prefilter works in a particular corner +// case. In particular, the shift offset detected for '/' in the patterns below +// was incorrect, leading to a false negative. +#[test] +fn regression_rare_byte_prefilter() { + use crate::AhoCorasick; + + let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]); + assert!(ac.is_match("ab/j/")); +} + +#[test] +fn regression_case_insensitive_prefilter() { + use crate::AhoCorasickBuilder; + + for c in b'a'..b'z' { + for c2 in b'a'..b'z' { + let c = c as char; + let c2 = c2 as char; + let needle = format!("{}{}", c, c2).to_lowercase(); + let haystack = needle.to_uppercase(); + let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .prefilter(true) + .build(&[&needle]); assert_eq!( - test.matches, - get_match_triples(f(&test)).as_slice(), - "test: {}, patterns: {:?}, haystack: {:?}", - test.name, - test.patterns, - test.haystack + 1, + ac.find_iter(&haystack).count(), + "failed to find {:?} in {:?}\n\nautomaton:\n{:?}", + needle, + haystack, + ac, ); } } } + +// See: https://github.com/BurntSushi/aho-corasick/issues/64 +// +// This occurs when the rare byte prefilter is active. +#[test] +fn regression_stream_rare_byte_prefilter() { + use std::io::Read; + + // NOTE: The test only fails if this ends with j. + const MAGIC: [u8; 5] = *b"1234j"; + + // NOTE: The test fails for values in 8188..=8191. These values put the string + // to search across two calls to read because the buffer size is 8192 by + // default. + const BEGIN: usize = 8191; + + /// This is just a structure that implements Read. The reader + /// implementation will simulate a file filled with 0, except for the MAGIC + /// string at offset BEGIN. + #[derive(Default)] + struct R { + read: usize, + } + + impl Read for R { + fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> { + //dbg!(buf.len()); + if self.read > 100000 { + return Ok(0); + } + let mut from = 0; + if self.read < BEGIN { + from = buf.len().min(BEGIN - self.read); + for x in 0..from { + buf[x] = 0; + } + self.read += from; + } + if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() { + let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from); + if to > from { + buf[from..to].copy_from_slice( + &MAGIC + [self.read - BEGIN..self.read - BEGIN + to - from], + ); + self.read += to - from; + from = to; + } + } + for x in from..buf.len() { + buf[x] = 0; + self.read += 1; + } + Ok(buf.len()) + } + } + + fn run() -> ::std::io::Result<()> { + let aut = AhoCorasickBuilder::new().build(&[&MAGIC]); + + // While reading from a vector, it works: + let mut buf = vec![]; + R::default().read_to_end(&mut buf)?; + let from_whole = aut.find_iter(&buf).next().unwrap().start(); + + // But using stream_find_iter fails! + let mut file = R::default(); + let begin = aut + .stream_find_iter(&mut file) + .next() + .expect("NOT FOUND!!!!")? // Panic here + .start(); + assert_eq!(from_whole, begin); + Ok(()) + } + + run().unwrap() +}
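For reference, a minimal sketch of how a caller exercises the two code paths covered by the regression tests above, assuming the aho-corasick 0.7 public API vendored by this patch. The patterns, the Cursor haystack, and the main wrapper are illustrative only; new_auto_configured, is_match, stream_find_iter, and the Match accessors are the same calls the tests themselves use.

```rust
use aho_corasick::AhoCorasick;

fn main() -> std::io::Result<()> {
    // Auto-configuration may select the rare-byte prefilter for these
    // patterns (the same ones used by regression_rare_byte_prefilter).
    let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
    assert!(ac.is_match("ab/j/"));

    // Streaming search over any io::Read; each item is an io::Result<Match>.
    // A Cursor stands in for the custom reader in the streaming regression
    // test, which reproduces a match spanning the internal buffer boundary.
    let haystack = std::io::Cursor::new(b"zzzz ab/j/ zzzz".to_vec());
    for result in ac.stream_find_iter(haystack) {
        let m = result?;
        println!("pattern {} at {}..{}", m.pattern(), m.start(), m.end());
    }
    Ok(())
}
```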