From 0cff65e5a86bfcacaa5ac318365d9b00c3d6ccbf Mon Sep 17 00:00:00 2001 From: RumovZ Date: Sun, 1 Nov 2020 17:24:08 +0100 Subject: [PATCH 01/20] Fix bugs and inconsistencies in the search parser --- rslib/src/search/parser.rs | 262 +++++++++++++++++++++++----------- rslib/src/search/sqlwriter.rs | 133 ++++++++--------- rslib/src/text.rs | 22 --- 3 files changed, 240 insertions(+), 177 deletions(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 6d3cd038c..44315af0d 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -9,16 +9,15 @@ use crate::{ use lazy_static::lazy_static; use nom::{ branch::alt, - bytes::complete::{escaped, is_not, tag, take_while1}, - character::complete::{anychar, char, one_of}, + bytes::complete::{escaped, is_not, tag}, + character::complete::{anychar, char, none_of, one_of}, combinator::{all_consuming, map, map_res}, - sequence::{delimited, preceded, tuple}, + sequence::{delimited, preceded, separated_pair}, {multi::many0, IResult}, }; -use regex::Regex; +use regex::{Captures, Regex}; use std::{borrow::Cow, num}; -// fixme: need to preserve \ when used twice in string struct ParseError {} @@ -42,6 +41,12 @@ impl From> for ParseError { type ParseResult = std::result::Result; +#[derive(Debug, PartialEq)] +pub(super) enum OptionalRe<'a> { + Text(Cow<'a, str>), + Re(Cow<'a, str>), +} + #[derive(Debug, PartialEq)] pub(super) enum Node<'a> { And, @@ -57,30 +62,30 @@ pub(super) enum SearchNode<'a> { UnqualifiedText(Cow<'a, str>), // foo:bar, where foo doesn't match a term below SingleField { - field: Cow<'a, str>, + field: OptionalRe<'a>, text: Cow<'a, str>, is_re: bool, }, AddedInDays(u32), EditedInDays(u32), - CardTemplate(TemplateKind), - Deck(Cow<'a, str>), + CardTemplate(TemplateKind<'a>), + Deck(String), DeckID(DeckID), NoteTypeID(NoteTypeID), - NoteType(Cow<'a, str>), + NoteType(OptionalRe<'a>), Rated { days: u32, ease: Option, }, - Tag(Cow<'a, str>), + Tag(OptionalRe<'a>), Duplicates { note_type_id: NoteTypeID, - text: String, + text: Cow<'a, str>, }, State(StateKind), Flag(u8), - NoteIDs(Cow<'a, str>), - CardIDs(Cow<'a, str>), + NoteIDs(&'a str), + CardIDs(&'a str), Property { operator: String, kind: PropertyKind, @@ -88,7 +93,7 @@ pub(super) enum SearchNode<'a> { WholeCollection, Regex(Cow<'a, str>), NoCombining(Cow<'a, str>), - WordBoundary(Cow<'a, str>), + WordBoundary(String), } #[derive(Debug, PartialEq)] @@ -113,9 +118,9 @@ pub(super) enum StateKind { } #[derive(Debug, PartialEq)] -pub(super) enum TemplateKind { +pub(super) enum TemplateKind<'a> { Ordinal(u16), - Name(String), + Name(OptionalRe<'a>), } /// Parse the input string into a list of nodes. @@ -127,7 +132,6 @@ pub(super) fn parse(input: &str) -> Result> { let (_, nodes) = all_consuming(group_inner)(input).map_err(|_e| AnkiError::SearchError(None))?; - Ok(nodes) } @@ -184,7 +188,7 @@ fn group_inner(input: &str) -> IResult<&str, Vec> { } fn whitespace0(s: &str) -> IResult<&str, Vec> { - many0(one_of(" \u{3000}"))(s) + many0(one_of(" \u{3000}\t\n"))(s) } /// Optional leading space, then a (negated) group or text @@ -205,32 +209,18 @@ fn text(s: &str) -> IResult<&str, Node> { /// Determine if text is a qualified search, and handle escaped chars. 
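+/// e.g. `deck:japanese` splits on the first unescaped ':' and is handled as a
+/// qualified search, while plain `japanese` becomes SearchNode::UnqualifiedText.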
fn search_node_for_text(s: &str) -> ParseResult { - let mut it = s.splitn(2, ':'); - let (head, tail) = ( - unescape_quotes(it.next().unwrap()), - it.next().map(unescape_quotes), - ); - - if let Some(tail) = tail { - search_node_for_text_with_argument(head, tail) + let (tail, head) = escaped(is_not(r":\"), '\\', anychar)(s)?; + if tail.is_empty() { + Ok(SearchNode::UnqualifiedText(unescape_to_glob(head)?)) } else { - Ok(SearchNode::UnqualifiedText(head)) + search_node_for_text_with_argument(head, &tail[1..]) } } -/// \" -> " -fn unescape_quotes(s: &str) -> Cow { - if s.find(r#"\""#).is_some() { - s.replace(r#"\""#, "\"").into() - } else { - s.into() - } -} - -/// Unquoted text, terminated by a space or ) +/// Unquoted text, terminated by whitespace or unescaped ", ( or ) fn unquoted_term(s: &str) -> IResult<&str, Node> { map_res( - take_while1(|c| c != ' ' && c != ')' && c != '"'), + escaped(is_not("\"() \u{3000}\\"), '\\', none_of(" \u{3000}")), |text: &str| -> ParseResult { Ok(if text.eq_ignore_ascii_case("or") { Node::Or @@ -261,52 +251,64 @@ fn quoted_term_inner(s: &str) -> IResult<&str, &str> { /// eg deck:"foo bar" - quotes must come after the : fn partially_quoted_term(s: &str) -> IResult<&str, Node> { - let term = take_while1(|c| c != ' ' && c != ')' && c != ':'); - let (s, (term, _, quoted_val)) = tuple((term, char(':'), quoted_term_str))(s)?; - let quoted_val = unescape_quotes(quoted_val); - - match search_node_for_text_with_argument(term.into(), quoted_val) { - Ok(search) => Ok((s, Node::Search(search))), - Err(_) => Err(nom::Err::Failure((s, nom::error::ErrorKind::NoneOf))), - } + map_res( + separated_pair( + escaped(is_not("\"(): \u{3000}\\"), '\\', none_of(": \u{3000}")), + char(':'), + quoted_term_str, + ), + |p| match search_node_for_text_with_argument(p.0, p.1) { + Ok(search) => Ok(Node::Search(search)), + Err(e) => Err(e), + }, + )(s) } /// Convert a colon-separated key/val pair into the relevant search type. 
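+/// e.g. "added:7" becomes SearchNode::AddedInDays(7); any key not matched
+/// below falls through to a single-field search on that key.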
fn search_node_for_text_with_argument<'a>( - key: Cow<'a, str>, - val: Cow<'a, str>, + key: &'a str, + val: &'a str, ) -> ParseResult> { Ok(match key.to_ascii_lowercase().as_str() { "added" => SearchNode::AddedInDays(val.parse()?), "edited" => SearchNode::EditedInDays(val.parse()?), - "deck" => SearchNode::Deck(val), - "note" => SearchNode::NoteType(val), - "tag" => SearchNode::Tag(val), + "deck" => SearchNode::Deck(unescape_to_enforced_re(val)?), + "note" => SearchNode::NoteType(unescape_to_re(val)?), + "tag" => SearchNode::Tag(parse_tag(val)?), "mid" => SearchNode::NoteTypeID(val.parse()?), "nid" => SearchNode::NoteIDs(check_id_list(val)?), "cid" => SearchNode::CardIDs(check_id_list(val)?), "did" => SearchNode::DeckID(val.parse()?), - "card" => parse_template(val.as_ref()), - "is" => parse_state(val.as_ref())?, - "flag" => parse_flag(val.as_ref())?, - "rated" => parse_rated(val.as_ref())?, - "dupe" => parse_dupes(val.as_ref())?, - "prop" => parse_prop(val.as_ref())?, - "re" => SearchNode::Regex(val), - "nc" => SearchNode::NoCombining(val), - "w" => SearchNode::WordBoundary(val), + "card" => parse_template(val)?, + "is" => parse_state(val)?, + "flag" => parse_flag(val)?, + "rated" => parse_rated(val)?, + "dupe" => parse_dupes(val)?, + "prop" => parse_prop(val)?, + "re" => SearchNode::Regex(unescape_quotes(val)), + "nc" => SearchNode::NoCombining(unescape_to_glob(val)?), + "w" => SearchNode::WordBoundary(unescape_to_enforced_re(val)?), // anything else is a field search - _ => parse_single_field(key.as_ref(), val.as_ref()), + _ => parse_single_field(key, val)?, }) } +/// Ensure the string doesn't contain whitespace and unescape. +fn parse_tag(s: &str) -> ParseResult { + if s.as_bytes().iter().any(u8::is_ascii_whitespace) { + Err(ParseError {}) + } else { + unescape_to_custom_re(s, r"\S") + } +} + /// ensure a list of ids contains only numbers and commas, returning unchanged if true /// used by nid: and cid: -fn check_id_list(s: Cow) -> ParseResult> { +fn check_id_list(s: &str) -> ParseResult<&str> { lazy_static! 
{ static ref RE: Regex = Regex::new(r"^(\d+,)*\d+$").unwrap(); } - if RE.is_match(s.as_ref()) { + if RE.is_match(s) { Ok(s) } else { Err(ParseError {}) @@ -360,13 +362,13 @@ fn parse_rated(val: &str) -> ParseResult> { } /// eg dupes:1231,hello -fn parse_dupes(val: &str) -> ParseResult> { +fn parse_dupes(val: &str) -> ParseResult { let mut it = val.splitn(2, ','); let mid: NoteTypeID = it.next().unwrap().parse()?; let text = it.next().ok_or(ParseError {})?; Ok(SearchNode::Duplicates { note_type_id: mid, - text: text.into(), + text: unescape_quotes(text), }) } @@ -411,27 +413,122 @@ fn parse_prop(val: &str) -> ParseResult> { }) } -fn parse_template(val: &str) -> SearchNode<'static> { - SearchNode::CardTemplate(match val.parse::() { +fn parse_template(val: &str) -> ParseResult { + Ok(SearchNode::CardTemplate(match val.parse::() { Ok(n) => TemplateKind::Ordinal(n.max(1) - 1), - Err(_) => TemplateKind::Name(val.into()), + Err(_) => TemplateKind::Name(unescape_to_re(val)?), + })) +} + +fn parse_single_field<'a>(key: &'a str, val: &'a str) -> ParseResult> { + Ok(if val.starts_with("re:") { + SearchNode::SingleField { + field: unescape_to_re(key)?, + text: unescape_quotes(&val[3..]), + is_re: true, + } + } else { + SearchNode::SingleField { + field: unescape_to_re(key)?, + text: unescape_to_glob(val)?, + is_re: false, + } }) } -fn parse_single_field(key: &str, mut val: &str) -> SearchNode<'static> { - let is_re = if val.starts_with("re:") { - val = val.trim_start_matches("re:"); - true +/// For strings without unescaped ", convert \" to " +fn unescape_quotes(s: &str) -> Cow { + if s.contains('"') { + s.replace(r#"\""#, "\"").into() } else { - false - }; - SearchNode::SingleField { - field: key.to_string().into(), - text: val.to_string().into(), - is_re, + s.into() } } +/// Check string for invalid escape sequences. +fn is_invalid_escape(txt: &str) -> bool { + // odd number of \s not followed by an escapable character + lazy_static! { + static ref RE: Regex = Regex::new(r#"(^|[^\\])(\\\\)*\\([^":*_()]|$)"#).unwrap(); + } + RE.is_match(txt) +} + +/// Handle escaped characters and convert Anki wildcards to SQL wildcards. +/// Return error if there is an undefined escape sequence. +fn unescape_to_glob(txt: &str) -> ParseResult> { + if is_invalid_escape(txt) { + Err(ParseError {}) + } else { + // escape sequences and unescaped special characters which need conversion + lazy_static! { + static ref RE: Regex = Regex::new(r"\\.|[*%]").unwrap(); + } + Ok(RE.replace_all(&txt, |caps: &Captures| { + match &caps[0] { + r"\\" => r"\\", + "\\\"" => "\"", + r"\:" => ":", + r"\*" => "*", + r"\_" => r"\_", + r"\(" => "(", + r"\)" => ")", + "*" => "%", + "%" => r"\%", + _ => unreachable!(), + } + })) + } +} + +/// Handle escaped characters and convert to regex if there are wildcards. +/// Return error if there is an undefined escape sequence. +fn unescape_to_re(txt: &str) -> ParseResult { + unescape_to_custom_re(txt, ".") +} + +/// Handle escaped characters and if there are wildcards, convert to a regex using the given wildcard. +/// Return error if there is an undefined escape sequence. +fn unescape_to_custom_re<'a>(txt: &'a str, wildcard: &str) -> ParseResult> { + if is_invalid_escape(txt) { + Err(ParseError {}) + } else { + lazy_static! 
{ + static ref WILDCARD: Regex = Regex::new(r"(^|[^\\])(\\\\)*[*_]").unwrap(); + static ref MAYBE_ESCAPED: Regex = Regex::new(r"\\?.").unwrap(); + static ref ESCAPED: Regex = Regex::new(r"\\(.)").unwrap(); + } + if WILDCARD.is_match(txt) { + Ok(OptionalRe::Re(MAYBE_ESCAPED.replace_all( + &txt, + |caps: &Captures| { + let s = &caps[0]; + match s { + r"\\" | r"\*" | r"\(" | r"\)" => s.to_string(), + "\\\"" => "\"".to_string(), + r"\:" => ":".to_string(), + r"*" => format!("{}*", wildcard), + "_" => wildcard.to_string(), + r"\_" => r"_".to_string(), + s => regex::escape(s), + } + }, + ))) + } else { + Ok(OptionalRe::Text(ESCAPED.replace_all(&txt, "$1"))) + } + } +} + +/// Handle escaped characters and convert to regex. +/// Return error if there is an undefined escape sequence. +fn unescape_to_enforced_re(txt: &str) -> ParseResult { + Ok(match unescape_to_re(txt)? { + OptionalRe::Text(s) => regex::escape(s.as_ref()), + OptionalRe::Re(s) => s.to_string(), + }) +} + #[cfg(test)] mod test { use super::*; @@ -440,6 +537,7 @@ mod test { fn parsing() -> Result<()> { use Node::*; use SearchNode::*; + use OptionalRe::*; assert_eq!(parse("")?, vec![Search(SearchNode::WholeCollection)]); assert_eq!(parse(" ")?, vec![Search(SearchNode::WholeCollection)]); @@ -478,7 +576,7 @@ mod test { Search(UnqualifiedText("world".into())), And, Search(SingleField { - field: "foo".into(), + field: Text("foo".into()), text: "bar baz".into(), is_re: false, }) @@ -491,7 +589,7 @@ mod test { assert_eq!( parse("foo:re:bar")?, vec![Search(SingleField { - field: "foo".into(), + field: Text("foo".into()), text: "bar".into(), is_re: true })] @@ -501,7 +599,7 @@ mod test { assert_eq!( parse(r#""field:va\"lue""#)?, vec![Search(SingleField { - field: "field".into(), + field: Text("foo".into()), text: "va\"lue".into(), is_re: false })] @@ -517,7 +615,7 @@ mod test { assert_eq!(parse("added:3")?, vec![Search(AddedInDays(3))]); assert_eq!( parse("card:front")?, - vec![Search(CardTemplate(TemplateKind::Name("front".into())))] + vec![Search(CardTemplate(TemplateKind::Name(Text("front".into()))))] ); assert_eq!( parse("card:3")?, diff --git a/rslib/src/search/sqlwriter.rs b/rslib/src/search/sqlwriter.rs index 611f3adce..9f17c555c 100644 --- a/rslib/src/search/sqlwriter.rs +++ b/rslib/src/search/sqlwriter.rs @@ -1,7 +1,7 @@ // Copyright: Ankitects Pty Ltd and contributors // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html -use super::parser::{Node, PropertyKind, SearchNode, StateKind, TemplateKind}; +use super::parser::{Node, PropertyKind, SearchNode, StateKind, OptionalRe, TemplateKind}; use crate::{ card::{CardQueue, CardType}, collection::Collection, @@ -9,13 +9,13 @@ use crate::{ err::Result, notes::field_checksum, notetype::NoteTypeID, - text::{matches_wildcard, text_to_re}, + text::text_to_re, text::{normalize_to_nfc, strip_html_preserving_image_filenames, without_combining}, timestamp::TimestampSecs, }; -use lazy_static::lazy_static; use regex::Regex; use std::{borrow::Cow, fmt::Write}; +use unicase::eq as uni_eq; pub(crate) struct SqlWriter<'a> { col: &'a mut Collection, @@ -119,7 +119,7 @@ impl SqlWriter<'_> { // note fields related SearchNode::UnqualifiedText(text) => self.write_unqualified(&self.norm_note(text)), SearchNode::SingleField { field, text, is_re } => { - self.write_single_field(field.as_ref(), &self.norm_note(text), *is_re)? + self.write_single_field(field, &self.norm_note(text), *is_re)? 
} SearchNode::Duplicates { note_type_id, text } => { self.write_dupes(*note_type_id, &self.norm_note(text)) @@ -131,14 +131,8 @@ impl SqlWriter<'_> { // other SearchNode::AddedInDays(days) => self.write_added(*days)?, SearchNode::EditedInDays(days) => self.write_edited(*days)?, - SearchNode::CardTemplate(template) => match template { - TemplateKind::Ordinal(_) => { - self.write_template(template)?; - } - TemplateKind::Name(name) => { - self.write_template(&TemplateKind::Name(norm(name).into()))?; - } - }, + // fixme: normalise in name case? + SearchNode::CardTemplate(template) => self.write_template(template)?, SearchNode::Deck(deck) => self.write_deck(&norm(deck))?, SearchNode::NoteTypeID(ntid) => { write!(self.sql, "n.mid = {}", ntid).unwrap(); @@ -146,9 +140,12 @@ impl SqlWriter<'_> { SearchNode::DeckID(did) => { write!(self.sql, "c.did = {}", did).unwrap(); } - SearchNode::NoteType(notetype) => self.write_note_type(&norm(notetype))?, + // fixme: normalise? + SearchNode::NoteType(notetype) => self.write_note_type(notetype)?, SearchNode::Rated { days, ease } => self.write_rated(*days, *ease)?, - SearchNode::Tag(tag) => self.write_tag(&norm(tag))?, + + // fixme: normalise? + SearchNode::Tag(tag) => self.write_tag(tag)?, SearchNode::State(state) => self.write_state(state)?, SearchNode::Flag(flag) => { write!(self.sql, "(c.flags & 7) == {}", flag).unwrap(); @@ -167,7 +164,7 @@ impl SqlWriter<'_> { fn write_unqualified(&mut self, text: &str) { // implicitly wrap in % - let text = format!("%{}%", convert_glob_char(text)); + let text = format!("%{}%", text); self.args.push(text); write!( self.sql, @@ -191,27 +188,27 @@ impl SqlWriter<'_> { .unwrap(); } - fn write_tag(&mut self, text: &str) -> Result<()> { + fn write_tag(&mut self, text: &OptionalRe) -> Result<()> { match text { - "none" => { - write!(self.sql, "n.tags = ''").unwrap(); - } - "*" | "%" => { - write!(self.sql, "true").unwrap(); - } - text => { - if let Some(re_glob) = glob_to_re(text) { - // text contains a wildcard - let re_glob = format!("(?i).* {} .*", re_glob); - write!(self.sql, "n.tags regexp ?").unwrap(); - self.args.push(re_glob); - } else if let Some(tag) = self.col.storage.preferred_tag_case(&text)? { + OptionalRe::Text(s) => { + if s == "none" { + write!(self.sql, "n.tags = ''").unwrap(); + } else if let Some(tag) = self.col.storage.preferred_tag_case(s)? 
{ write!(self.sql, "n.tags like ?").unwrap(); self.args.push(format!("% {} %", tag)); } else { write!(self.sql, "false").unwrap(); } } + OptionalRe::Re(s) => { + if s == "*" { + write!(self.sql, "true").unwrap(); + } else { + let re = format!("(?i).* {} .*", s); + write!(self.sql, "n.tags regexp ?").unwrap(); + self.args.push(re); + } + } } Ok(()) } @@ -340,45 +337,54 @@ impl SqlWriter<'_> { TemplateKind::Ordinal(n) => { write!(self.sql, "c.ord = {}", n).unwrap(); } - TemplateKind::Name(name) => { - if let Some(re) = glob_to_re(name) { - let re = format!("(?i){}", re); + TemplateKind::Name(name) => match name { + OptionalRe::Re(s) => { + let re = format!("(?i){}", s); self.sql.push_str( "(n.mid,c.ord) in (select ntid,ord from templates where name regexp ?)", ); self.args.push(re); - } else { + } + OptionalRe::Text(s) => { self.sql.push_str( "(n.mid,c.ord) in (select ntid,ord from templates where name = ?)", ); - self.args.push(name.to_string()); + self.args.push(s.to_string()); } - } + }, }; Ok(()) } - fn write_note_type(&mut self, nt_name: &str) -> Result<()> { - if let Some(re) = glob_to_re(nt_name) { - let re = format!("(?i){}", re); - self.sql - .push_str("n.mid in (select id from notetypes where name regexp ?)"); - self.args.push(re); - } else { - self.sql - .push_str("n.mid in (select id from notetypes where name = ?)"); - self.args.push(nt_name.to_string()); + fn write_note_type(&mut self, nt_name: &OptionalRe) -> Result<()> { + match nt_name { + OptionalRe::Re(s) => { + let re = format!("(?i){}", s); + self.sql + .push_str("n.mid in (select id from notetypes where name regexp ?)"); + self.args.push(re); + } + OptionalRe::Text(s) => { + self.sql + .push_str("n.mid in (select id from notetypes where name = ?)"); + self.args.push(s.to_string()); + } } Ok(()) } - fn write_single_field(&mut self, field_name: &str, val: &str, is_re: bool) -> Result<()> { + fn write_single_field( + &mut self, + field_name: &OptionalRe, + val: &str, + is_re: bool, + ) -> Result<()> { let note_types = self.col.get_all_notetypes()?; let mut field_map = vec![]; for nt in note_types.values() { for field in &nt.fields { - if matches_wildcard(&field.name, field_name) { + if matches_string_variant(&field.name, field_name) { field_map.push((nt.id, field.ord)); } } @@ -401,7 +407,7 @@ impl SqlWriter<'_> { } else { cmp = "like"; cmp_trailer = "escape '\\'"; - self.args.push(convert_glob_char(val).into()) + self.args.push(val.into()) } let arg_idx = self.args.len(); @@ -455,27 +461,16 @@ impl SqlWriter<'_> { } fn write_word_boundary(&mut self, word: &str) { - // fixme: need to escape in the no-glob case as well - let re = text_to_re(word); - self.write_regex(&format!(r"\b{}\b", re)) + self.write_regex(&format!(r"\b{}\b", word)) } } -/// Replace * with %, leaving \* alone. -fn convert_glob_char(val: &str) -> Cow { - lazy_static! { - static ref RE: Regex = Regex::new(r"(^|[^\\])\*").unwrap(); +/// True if the content of search is equal to text, folding case. +fn matches_string_variant(text: &str, search: &OptionalRe) -> bool { + match search { + OptionalRe::Re(s) => Regex::new(&format!("^(?i){}$", s)).unwrap().is_match(text), + OptionalRe::Text(s) => uni_eq(text, s), } - RE.replace_all(val, "${1}%") -} - -/// Convert a string with _, % or * characters into a regex. -/// If string contains no globbing characters, return None. 
-fn glob_to_re(glob: &str) -> Option { - if !glob.contains(|c| c == '_' || c == '*' || c == '%') { - return None; - } - Some(text_to_re(glob)) } #[derive(Debug, PartialEq, Clone, Copy)] @@ -803,12 +798,4 @@ mod test { RequiredTable::Notes ); } - - #[test] - fn convert_glob() { - assert_eq!(&convert_glob_char("foo*bar"), "foo%bar"); - assert_eq!(&convert_glob_char("*bar"), "%bar"); - assert_eq!(&convert_glob_char("\n*bar"), "\n%bar"); - assert_eq!(&convert_glob_char(r"\*bar"), r"\*bar"); - } } diff --git a/rslib/src/text.rs b/rslib/src/text.rs index 0793281c0..b24b838c9 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -5,7 +5,6 @@ use lazy_static::lazy_static; use regex::{Captures, Regex}; use std::borrow::Cow; use std::ptr; -use unicase::eq as uni_eq; use unicode_normalization::{ char::is_combining_mark, is_nfc, is_nfkd_quick, IsNormalized, UnicodeNormalization, }; @@ -240,17 +239,6 @@ pub(crate) fn ensure_string_in_nfc(s: &mut String) { } } -/// True if search is equal to text, folding case. -/// Supports '*' to match 0 or more characters. -pub(crate) fn matches_wildcard(text: &str, search: &str) -> bool { - if search.contains('*') { - let search = format!("^(?i){}$", regex::escape(search).replace(r"\*", ".*")); - Regex::new(&search).unwrap().is_match(text) - } else { - uni_eq(text, search) - } -} - /// Convert provided string to NFKD form and strip combining characters. pub(crate) fn without_combining(s: &str) -> Cow { // if the string is already normalized @@ -303,7 +291,6 @@ pub(crate) fn text_to_re(glob: &str) -> String { #[cfg(test)] mod test { - use super::matches_wildcard; use crate::text::without_combining; use crate::text::{ extract_av_tags, strip_av_tags, strip_html, strip_html_preserving_image_filenames, AVTag, @@ -351,15 +338,6 @@ mod test { ); } - #[test] - fn wildcard() { - assert_eq!(matches_wildcard("foo", "bar"), false); - assert_eq!(matches_wildcard("foo", "Foo"), true); - assert_eq!(matches_wildcard("foo", "F*"), true); - assert_eq!(matches_wildcard("foo", "F*oo"), true); - assert_eq!(matches_wildcard("foo", "b*"), false); - } - #[test] fn combining() { assert!(matches!(without_combining("test"), Cow::Borrowed(_))); From b186e61e549b8f0322fecee3aa97ebb246869ee4 Mon Sep 17 00:00:00 2001 From: RumovZ Date: Sat, 14 Nov 2020 18:28:24 +0100 Subject: [PATCH 02/20] Fix 'escaped' parser for empty string Fix a bug where 'escaped' parsers (nom) accepted the empty string by wrapping them in 'verify' parsers. 
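For illustration only (not part of the patch), a minimal standalone sketch of
the pattern, reusing the same nom combinators the parser already imports; the
exact behaviour of 'escaped' on empty input is assumed to match the nom
version used by this crate:

    use nom::{
        bytes::complete::{escaped, is_not},
        character::complete::anychar,
        combinator::verify,
        IResult,
    };

    /// Body of a quoted term: one or more chars, stopping at an unescaped '"'.
    fn quoted_term_inner(s: &str) -> IResult<&str, &str> {
        verify(escaped(is_not(r#""\"#), '\\', anychar), |s: &str| {
            !s.is_empty()
        })(s)
    }

    fn main() {
        // the empty string is rejected instead of matching zero characters
        assert!(quoted_term_inner("").is_err());
        let (rest, inner) = quoted_term_inner(r#"foo\"bar" baz"#).unwrap();
        assert_eq!(inner, r#"foo\"bar"#);
        assert_eq!(rest, r#"" baz"#);
    }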
--- rslib/src/search/parser.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 44315af0d..37ff504f8 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -11,7 +11,7 @@ use nom::{ branch::alt, bytes::complete::{escaped, is_not, tag}, character::complete::{anychar, char, none_of, one_of}, - combinator::{all_consuming, map, map_res}, + combinator::{all_consuming, map, map_res, verify}, sequence::{delimited, preceded, separated_pair}, {multi::many0, IResult}, }; @@ -220,7 +220,10 @@ fn search_node_for_text(s: &str) -> ParseResult { /// Unquoted text, terminated by whitespace or unescaped ", ( or ) fn unquoted_term(s: &str) -> IResult<&str, Node> { map_res( + verify( escaped(is_not("\"() \u{3000}\\"), '\\', none_of(" \u{3000}")), + |s: &str| !s.is_empty(), + ), |text: &str| -> ParseResult { Ok(if text.eq_ignore_ascii_case("or") { Node::Or @@ -246,14 +249,19 @@ fn quoted_term_str(s: &str) -> IResult<&str, &str> { /// Quoted text, terminated by a non-escaped double quote fn quoted_term_inner(s: &str) -> IResult<&str, &str> { - escaped(is_not(r#""\"#), '\\', anychar)(s) + verify(escaped(is_not(r#""\"#), '\\', anychar), |s: &str| { + !s.is_empty() + })(s) } /// eg deck:"foo bar" - quotes must come after the : fn partially_quoted_term(s: &str) -> IResult<&str, Node> { map_res( separated_pair( + verify( escaped(is_not("\"(): \u{3000}\\"), '\\', none_of(": \u{3000}")), + |s: &str| !s.is_empty(), + ), char(':'), quoted_term_str, ), From 39499967285714ed30bf1ed9274b926a06583b49 Mon Sep 17 00:00:00 2001 From: RumovZ Date: Sat, 14 Nov 2020 18:32:41 +0100 Subject: [PATCH 03/20] For deck searches, unescape quotes only For now, revert to the old handling of deck names using text_to_re from text.rs and have parser.rs only unescape quotes. --- rslib/src/search/parser.rs | 4 ++-- rslib/src/search/sqlwriter.rs | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 37ff504f8..54fb8c553 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -69,7 +69,7 @@ pub(super) enum SearchNode<'a> { AddedInDays(u32), EditedInDays(u32), CardTemplate(TemplateKind<'a>), - Deck(String), + Deck(Cow<'a, str>), DeckID(DeckID), NoteTypeID(NoteTypeID), NoteType(OptionalRe<'a>), @@ -280,7 +280,7 @@ fn search_node_for_text_with_argument<'a>( Ok(match key.to_ascii_lowercase().as_str() { "added" => SearchNode::AddedInDays(val.parse()?), "edited" => SearchNode::EditedInDays(val.parse()?), - "deck" => SearchNode::Deck(unescape_to_enforced_re(val)?), + "deck" => SearchNode::Deck(unescape_quotes(val)), "note" => SearchNode::NoteType(unescape_to_re(val)?), "tag" => SearchNode::Tag(parse_tag(val)?), "mid" => SearchNode::NoteTypeID(val.parse()?), diff --git a/rslib/src/search/sqlwriter.rs b/rslib/src/search/sqlwriter.rs index 9f17c555c..5ea422b04 100644 --- a/rslib/src/search/sqlwriter.rs +++ b/rslib/src/search/sqlwriter.rs @@ -319,6 +319,7 @@ impl SqlWriter<'_> { }; // convert to a regex that includes child decks + // fixme: use unescape_to_enforced_re from parser.rs? 
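+        // matches the deck itself or any child deck: name components are
+        // joined with \x1f in storage, hence the ($|\x1f) suffix below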
let re = text_to_re(&native_deck); self.args.push(format!("(?i)^{}($|\x1f)", re)); let arg_idx = self.args.len(); From 836977aac8cc2ac17d286eaa294fb8b5b9f52508 Mon Sep 17 00:00:00 2001 From: RumovZ Date: Sat, 14 Nov 2020 19:10:56 +0100 Subject: [PATCH 04/20] Fix whitespace in write_tag and parser/whitespac0 --- rslib/src/search/parser.rs | 2 +- rslib/src/search/sqlwriter.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 54fb8c553..1123cd701 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -188,7 +188,7 @@ fn group_inner(input: &str) -> IResult<&str, Vec> { } fn whitespace0(s: &str) -> IResult<&str, Vec> { - many0(one_of(" \u{3000}\t\n"))(s) + many0(one_of(" \u{3000}"))(s) } /// Optional leading space, then a (negated) group or text diff --git a/rslib/src/search/sqlwriter.rs b/rslib/src/search/sqlwriter.rs index 5ea422b04..e062bab01 100644 --- a/rslib/src/search/sqlwriter.rs +++ b/rslib/src/search/sqlwriter.rs @@ -201,7 +201,7 @@ impl SqlWriter<'_> { } } OptionalRe::Re(s) => { - if s == "*" { + if s == r"\S*" { write!(self.sql, "true").unwrap(); } else { let re = format!("(?i).* {} .*", s); From 9e5348100366da5a5aea3bdaccd55933c20deeb2 Mon Sep 17 00:00:00 2001 From: RumovZ Date: Sat, 14 Nov 2020 19:13:09 +0100 Subject: [PATCH 05/20] Fix tests for new search parsing (and reformat) --- rslib/src/search/parser.rs | 50 ++++++++++++++++++----------------- rslib/src/search/sqlwriter.rs | 13 +++++---- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/rslib/src/search/parser.rs b/rslib/src/search/parser.rs index 1123cd701..e4f4cdb93 100644 --- a/rslib/src/search/parser.rs +++ b/rslib/src/search/parser.rs @@ -18,7 +18,6 @@ use nom::{ use regex::{Captures, Regex}; use std::{borrow::Cow, num}; - struct ParseError {} impl From for ParseError { @@ -221,7 +220,7 @@ fn search_node_for_text(s: &str) -> ParseResult { fn unquoted_term(s: &str) -> IResult<&str, Node> { map_res( verify( - escaped(is_not("\"() \u{3000}\\"), '\\', none_of(" \u{3000}")), + escaped(is_not("\"() \u{3000}\\"), '\\', none_of(" \u{3000}")), |s: &str| !s.is_empty(), ), |text: &str| -> ParseResult { @@ -259,7 +258,7 @@ fn partially_quoted_term(s: &str) -> IResult<&str, Node> { map_res( separated_pair( verify( - escaped(is_not("\"(): \u{3000}\\"), '\\', none_of(": \u{3000}")), + escaped(is_not("\"(): \u{3000}\\"), '\\', none_of(": \u{3000}")), |s: &str| !s.is_empty(), ), char(':'), @@ -472,19 +471,17 @@ fn unescape_to_glob(txt: &str) -> ParseResult> { lazy_static! { static ref RE: Regex = Regex::new(r"\\.|[*%]").unwrap(); } - Ok(RE.replace_all(&txt, |caps: &Captures| { - match &caps[0] { - r"\\" => r"\\", - "\\\"" => "\"", - r"\:" => ":", - r"\*" => "*", - r"\_" => r"\_", - r"\(" => "(", - r"\)" => ")", - "*" => "%", - "%" => r"\%", - _ => unreachable!(), - } + Ok(RE.replace_all(&txt, |caps: &Captures| match &caps[0] { + r"\\" => r"\\", + "\\\"" => "\"", + r"\:" => ":", + r"\*" => "*", + r"\_" => r"\_", + r"\(" => "(", + r"\)" => ")", + "*" => "%", + "%" => r"\%", + _ => unreachable!(), })) } } @@ -512,12 +509,12 @@ fn unescape_to_custom_re<'a>(txt: &'a str, wildcard: &str) -> ParseResult