Commit 42b5540
bugfix: concatenate short chunks to next one
densumesh committed Sep 19, 2024
1 parent fdd9bfa commit 42b5540
Showing 2 changed files with 33 additions and 15 deletions.
3 changes: 1 addition & 2 deletions server/src/bin/crawl-worker.rs
@@ -18,7 +18,7 @@ use trieve_server::{
     data::models::{CrawlStatus, Pool},
     errors::ServiceError,
     establish_connection, get_env,
-    operators::crawl_operator::{get_images, get_tags, update_crawl_status},
+    operators::crawl_operator::{get_tags, update_crawl_status},
 };
 use trieve_server::{
     handlers::chunk_handler::ChunkReqPayload, operators::crawl_operator::chunk_html,
@@ -128,7 +128,6 @@ async fn crawl(
             chunk_html: Some(chunk_html.clone()),
             link: Some(page_link.clone()),
             tag_set: Some(page_tags.clone()),
-            image_urls: Some(get_images(&chunk_html.clone())),
             metadata: Some(json!({
                 "title": page_title.clone(),
                 "description": page_description.clone(),
45 changes: 32 additions & 13 deletions server/src/operators/crawl_operator.rs
@@ -15,6 +15,8 @@ use regex::Regex;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
 
+use super::parse_operator::convert_html_to_text;
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct IngestResult {
     pub status: Status,
@@ -414,6 +416,7 @@ pub async fn get_crawl_from_firecrawl(scrape_id: uuid::Uuid) -> Result<IngestRes
         log::error!("Error parsing response from firecrawl: {:?}", e);
         ServiceError::InternalServerError("Error parsing response from firecrawl".to_string())
     })?;
+
     if ingest_result.status != Status::Completed {
         log::info!("Crawl status: {:?}", ingest_result.status);
         return Ok(ingest_result);
@@ -511,34 +514,42 @@ pub fn get_tags(url: String) -> Vec<String> {
     Vec::new()
 }
 
-pub fn get_images(markdown_content: &str) -> Vec<String> {
-    let image_pattern = Regex::new(r"\((https?://.*?\.(?:png|jpg|jpeg|gif|bmp|webp))\)").unwrap();
-    image_pattern
-        .captures_iter(markdown_content)
-        .filter_map(|cap| cap.get(1))
-        .map(|m| m.as_str().to_string())
-        .collect()
-}
-
 pub fn chunk_html(html: &str) -> Vec<(String, String)> {
     let re = Regex::new(r"(?i)<h[1-6].*?>").unwrap();
     let mut chunks = Vec::new();
     let mut current_chunk = (String::new(), String::new());
     let mut last_end = 0;
+    let mut short_chunk: Option<(String, String)> = None;
 
     for cap in re.find_iter(html) {
         if last_end != cap.start() {
             current_chunk.1.push_str(&html[last_end..cap.start()]);
         }
 
-        if !current_chunk.1.is_empty() {
+        if !current_chunk.1.is_empty() && current_chunk.0 != current_chunk.1 {
             current_chunk.1 = current_chunk.1.trim().to_string();
-            chunks.push(current_chunk);
-
-            current_chunk = (String::new(), String::new());
+            if let Some(prev_short_chunk) = short_chunk.take() {
+                current_chunk.1 = format!("{} {}", prev_short_chunk.1, current_chunk.1);
+                current_chunk.0 = prev_short_chunk.0;
+            }
+
+            if convert_html_to_text(&current_chunk.1)
+                .split_whitespace()
+                .count()
+                > 5
+            {
+                chunks.push(current_chunk);
+                current_chunk = (String::new(), String::new());
+            } else {
+                short_chunk = Some(current_chunk);
+                current_chunk = (String::new(), String::new());
+            }
         }
 
-        current_chunk.0 = cap.as_str().to_string();
+        if current_chunk.0.is_empty() {
+            current_chunk.0 = cap.as_str().to_string();
+        }
         current_chunk.1 = cap.as_str().to_string();
         last_end = cap.end();
     }
@@ -549,7 +560,15 @@ pub fn chunk_html(html: &str) -> Vec<(String, String)> {
 
     if !current_chunk.1.is_empty() {
         current_chunk.1 = current_chunk.1.trim().to_string();
+
+        if let Some(prev_short_chunk) = short_chunk.take() {
+            current_chunk.1 = format!("{} {}", prev_short_chunk.1, current_chunk.1);
+            current_chunk.0 = prev_short_chunk.0;
+        }
+
         chunks.push(current_chunk);
+    } else if let Some(last_short_chunk) = short_chunk {
+        chunks.push(last_short_chunk);
     }
 
     chunks
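
A minimal sketch of the intended behavior (not part of the commit). It assumes the unshown middle of chunk_html still appends the trailing HTML after the last heading, as in the original, and that convert_html_to_text strips tags down to plain text; the test module and its names are hypothetical. A heading section whose text is five words or fewer should no longer be emitted on its own, but folded into the chunk that follows it.

#[cfg(test)]
mod chunk_html_sketch {
    use super::chunk_html;

    #[test]
    fn short_chunk_is_concatenated_to_next_one() {
        // "Intro" carries only three words of text, so it lands under the
        // `> 5` word threshold and is parked in `short_chunk`...
        let html = "<h2>Intro</h2><p>Too short.</p>\
                    <h2>Details</h2><p>This body easily clears the five word threshold.</p>";

        let chunks = chunk_html(html);

        // ...and is then prepended to the following "Details" chunk
        // instead of being emitted as a chunk of its own.
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].1.contains("Too short."));
        assert!(chunks[0].1.contains("five word threshold"));
    }
}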
