Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Etolbakov fix incompatible es date format #5237

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
447 changes: 224 additions & 223 deletions quickwit/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion quickwit/quickwit-datetime/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ license.workspace = true
[dependencies]
anyhow = { workspace = true }
itertools = { workspace = true }
ouroboros = "0.18.0"
regex = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tantivy = { workspace = true }
Expand Down
153 changes: 113 additions & 40 deletions quickwit/quickwit-datetime/src/date_time_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,60 +19,123 @@

use std::fmt::Display;
use std::str::FromStr;
use std::sync::OnceLock;

use ouroboros::self_referencing;
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize};
use serde_json::Value as JsonValue;
use time::error::Format;
use time::format_description::modifier::{Day, Month as MonthModifier, Padding, Year, YearRepr};
use time::format_description::well_known::{Iso8601, Rfc2822, Rfc3339};
use time::format_description::FormatItem;
use time::format_description::{Component, OwnedFormatItem};
use time::parsing::Parsed;
use time::{Month, OffsetDateTime, PrimitiveDateTime};
use time_fmt::parse::time_format_item::parse_to_format_item;

use crate::TantivyDateTime;
use crate::{RegexTokenizer, TantivyDateTime};

fn literal(s: &[u8]) -> OwnedFormatItem{
// builds a boxed slice from a slice
let boxed_slice: Box<[u8]> = s.to_vec().into_boxed_slice();
OwnedFormatItem::Literal(boxed_slice)
}

fn build_month_item(ptn: &str) -> Option<OwnedFormatItem> {
let mut month: MonthModifier = Default::default();
if ptn.len() == 2 {
month.padding = Padding::Zero;
} else {
month.padding = Padding::None;
}
Some(OwnedFormatItem::Component(Component::Month(month)))
}

fn build_year_item(ptn: &str) -> Option<OwnedFormatItem> {
let year_repr = if ptn.len() == 4 {
YearRepr::Full
} else {
YearRepr::LastTwo
};
let mut year = Year::default();
year.repr = year_repr;
Some(OwnedFormatItem::Component(Component::Year(year)))
}

fn build_day_item(ptn: &str) -> Option<OwnedFormatItem> {
let mut day = Day::default();
if ptn.len() == 2 {
day.padding = Padding::Zero;
} else {
day.padding = Padding::None;
};
Some(OwnedFormatItem::Component(Component::Day(day)))
}

// Elasticsearch/OpenSearch uses a set of preconfigured formats, more information could be found
// here https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html
fn java_date_format_tokenizer() -> &'static RegexTokenizer<OwnedFormatItem> {
static JAVA_DATE_FORMAT_TOKENIZER: OnceLock<RegexTokenizer<OwnedFormatItem>> = OnceLock::new();
&*JAVA_DATE_FORMAT_TOKENIZER.get_or_init(|| {
super::RegexTokenizer::new(vec![
(r#"yy(yy)?"#, build_year_item),
(r#"MM?"#, build_month_item),
(r#"dd?"#, build_day_item),
(r#"''"#, |_| { Some(literal(b"'")) }),
(r#"'[^']+'"#, |s| { Some(literal(s[1..s.len() - 1].as_bytes())) }),
(r#"[^\w\[\]{}]"#, |s| { Some(literal(s.as_bytes())) }),
]).unwrap()
})
}

/// A date time parser that holds the format specification `Vec<FormatItem>`.
#[self_referencing]
#[derive(Clone)]
pub struct StrptimeParser {
strptime_format: String,
with_timezone: bool,
#[borrows(strptime_format)]
#[covariant]
items: Vec<FormatItem<'this>>,
items: Vec<OwnedFormatItem>,
}

impl FromStr for StrptimeParser {
type Err = String;
impl StrptimeParser {

fn from_str(strptime_format: &str) -> Result<Self, Self::Err> {
StrptimeParser::try_new(
strptime_format.to_string(),
strptime_format.to_lowercase().contains("%z"),
|strptime_format: &String| {
parse_to_format_item(strptime_format).map_err(|error| {
format!("invalid strptime format `{strptime_format}`: {error}")
})
},
)
pub fn from_strptime(strptime_format: &str) -> Result<StrptimeParser, String> {
let items: Vec<OwnedFormatItem> = parse_to_format_item(strptime_format)
.map_err(|error| {
format!("invalid strptime format `{strptime_format}`: {error}")
})?
.into_iter()
.map(|item| item.into())
.collect();
Ok(StrptimeParser {
strptime_format: strptime_format.to_string(),
with_timezone: strptime_format.to_lowercase().contains("%z"),
items
})
}

pub fn from_java_datetime_format(java_datetime_format: &str) -> Result<StrptimeParser, String> {
let items = java_date_format_tokenizer().tokenize(java_datetime_format).map_err(|pos| {
format!("failed to parse date format `{java_datetime_format}`. Pattern at pos {pos} is not recognized.")
})?;
Ok(StrptimeParser {
strptime_format: java_datetime_format.to_string(),
with_timezone: false,
items,
})
}
}

impl StrptimeParser {
/// Parse a given date according to the datetime format specified during the StrptimeParser
/// creation. If the date format does not provide a specific a time, the time will be set to
/// 00:00:00.
fn parse_primitive_date_time(&self, date_time_str: &str) -> anyhow::Result<PrimitiveDateTime> {
let mut parsed = Parsed::new();
if !parsed
.parse_items(date_time_str.as_bytes(), self.borrow_items())?
.parse_items(date_time_str.as_bytes(), &self.items)?
.is_empty()
{
anyhow::bail!(
"datetime string `{}` does not match strptime format `{}`",
date_time_str,
self.borrow_strptime_format()
&self.strptime_format
);
}
// The parsed datetime contains a date but seems to be missing "time".
Expand All @@ -94,8 +157,8 @@ impl StrptimeParser {
}

pub fn parse_date_time(&self, date_time_str: &str) -> Result<OffsetDateTime, String> {
if *self.borrow_with_timezone() {
OffsetDateTime::parse(date_time_str, self.borrow_items()).map_err(|err| err.to_string())
if self.with_timezone {
OffsetDateTime::parse(date_time_str, &self.items).map_err(|err| err.to_string())
} else {
self.parse_primitive_date_time(date_time_str)
.map(|date_time| date_time.assume_utc())
Expand All @@ -104,20 +167,14 @@ impl StrptimeParser {
}

pub fn format_date_time(&self, date_time: &OffsetDateTime) -> Result<String, Format> {
date_time.format(self.borrow_items())
date_time.format(&self.items)
}
}

impl Clone for StrptimeParser {
fn clone(&self) -> Self {
// `self.format` is already known to be a valid format.
Self::from_str(self.borrow_strptime_format().as_str()).unwrap()
}
}

impl PartialEq for StrptimeParser {
fn eq(&self, other: &Self) -> bool {
self.borrow_strptime_format() == other.borrow_strptime_format()
self.strptime_format == other.strptime_format
}
}

Expand All @@ -127,14 +184,14 @@ impl std::fmt::Debug for StrptimeParser {
fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
formatter
.debug_struct("StrptimeParser")
.field("format", &self.borrow_strptime_format())
.field("format", &self.strptime_format)
.finish()
}
}

impl std::hash::Hash for StrptimeParser {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.borrow_strptime_format().hash(state);
self.strptime_format.hash(state);
}
}

Expand Down Expand Up @@ -170,7 +227,7 @@ impl DateTimeInputFormat {
DateTimeInputFormat::Iso8601 => "iso8601",
DateTimeInputFormat::Rfc2822 => "rfc2822",
DateTimeInputFormat::Rfc3339 => "rfc3339",
DateTimeInputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeInputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeInputFormat::Timestamp => "unix_timestamp",
}
}
Expand Down Expand Up @@ -198,7 +255,7 @@ impl FromStr for DateTimeInputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeInputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeInputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -241,7 +298,7 @@ impl DateTimeOutputFormat {
DateTimeOutputFormat::Iso8601 => "iso8601",
DateTimeOutputFormat::Rfc2822 => "rfc2822",
DateTimeOutputFormat::Rfc3339 => "rfc3339",
DateTimeOutputFormat::Strptime(parser) => parser.borrow_strptime_format(),
DateTimeOutputFormat::Strptime(parser) => parser.strptime_format.as_str(),
DateTimeOutputFormat::TimestampSecs => "unix_timestamp_secs",
DateTimeOutputFormat::TimestampMillis => "unix_timestamp_millis",
DateTimeOutputFormat::TimestampMicros => "unix_timestamp_micros",
Expand Down Expand Up @@ -300,7 +357,7 @@ impl FromStr for DateTimeOutputFormat {
format must contain at least one `strftime` special characters"
));
}
DateTimeOutputFormat::Strptime(StrptimeParser::from_str(date_time_format_str)?)
DateTimeOutputFormat::Strptime(StrptimeParser::from_strptime(date_time_format_str)?)
}
};
Ok(date_time_format)
Expand Down Expand Up @@ -464,7 +521,7 @@ mod tests {

#[test]
fn test_strictly_parse_datetime_format() {
let parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
let parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap();
assert_eq!(
parser.parse_date_time("2021-01-01").unwrap(),
datetime!(2021-01-01 00:00:00 UTC)
Expand All @@ -476,6 +533,22 @@ mod tests {
);
}


#[test]
fn test_parse_java_datetime_format() {
let parser = StrptimeParser::from_java_datetime_format("yyyy MM dd").unwrap();
assert_eq!(
parser.parse_date_time("2021 01 01").unwrap(),
datetime!(2021-01-01 00:00:00 UTC)
);

let parser = StrptimeParser::from_java_datetime_format("yyyy!MM?dd").unwrap();
assert_eq!(
parser.parse_date_time("2021!01?01").unwrap(),
datetime!(2021-01-01 00:00:00 UTC)
);
}

#[test]
fn test_infer_year() {
let inferred_year = infer_year(None, Month::January, 2024);
Expand Down
16 changes: 7 additions & 9 deletions quickwit/quickwit-datetime/src/date_time_parsing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,6 @@ pub fn parse_timestamp(timestamp: i64) -> Result<TantivyDateTime, String> {

#[cfg(test)]
mod tests {
use std::str::FromStr;

use time::macros::datetime;
use time::Month;

Expand Down Expand Up @@ -262,7 +260,7 @@ mod tests {
),
];
for (fmt, date_time_str, expected) in test_data {
let parser = StrptimeParser::from_str(fmt).unwrap();
let parser = StrptimeParser::from_strptime(fmt).unwrap();
let result = parser.parse_date_time(date_time_str);
if let Err(error) = &result {
panic!(
Expand All @@ -276,14 +274,14 @@ mod tests {

#[test]
fn test_parse_date_without_time() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d").unwrap();
let date = strptime_parser.parse_date_time("2012-05-21").unwrap();
assert_eq!(date, datetime!(2012-05-21 00:00:00 UTC));
}

#[test]
fn test_parse_date_am_pm_hour_not_zeroed() {
let strptime_parser = StrptimeParser::from_str("%Y-%m-%d %I:%M:%S %p").unwrap();
let strptime_parser = StrptimeParser::from_strptime("%Y-%m-%d %I:%M:%S %p").unwrap();
let date = strptime_parser
.parse_date_time("2012-05-21 10:05:12 pm")
.unwrap();
Expand All @@ -309,13 +307,13 @@ mod tests {
DateTimeInputFormat::Rfc2822,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S").unwrap(),
),
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y/%m/%d %H:%M:%S %z").unwrap(),
StrptimeParser::from_strptime("%Y/%m/%d %H:%M:%S %z").unwrap(),
),
DateTimeInputFormat::Timestamp,
],
Expand Down Expand Up @@ -452,7 +450,7 @@ mod tests {
DateTimeInputFormat::Iso8601,
DateTimeInputFormat::Rfc3339,
DateTimeInputFormat::Strptime(
StrptimeParser::from_str("%Y-%m-%d %H:%M:%S.%f").unwrap(),
StrptimeParser::from_strptime("%Y-%m-%d %H:%M:%S.%f").unwrap(),
),
],
)
Expand Down
2 changes: 2 additions & 0 deletions quickwit/quickwit-datetime/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

mod date_time_format;
mod date_time_parsing;
mod regex_tokenizer;

pub(crate) use regex_tokenizer::RegexTokenizer;
pub use date_time_format::{DateTimeInputFormat, DateTimeOutputFormat, StrptimeParser};
pub use date_time_parsing::{
parse_date_time_str, parse_timestamp, parse_timestamp_float, parse_timestamp_int,
Expand Down
Loading
Loading