use crate::message::{Message, MessageError}; use crate::stores::{IMessageSearcher, MessageStoreError}; use log::{error, info}; use std::cmp; use std::collections::HashSet; use std::fs; use std::panic; use std::path::PathBuf; use std::string::ToString; use tantivy::collector::{TopDocs, Count}; use tantivy::directory::MmapDirectory; use tantivy::query::{AllQuery, BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery, RegexQuery}; use tantivy::schema::*; const BYTES_IN_MB: usize = 1024 * 1024; pub type TantivyMessage = Message; pub trait TantivyFrom { fn from_tantivy(doc: Document, schema: &EmailSchema) -> Result; } impl TantivyFrom for TantivyMessage { fn from_tantivy(doc: Document, schema: &EmailSchema) -> Result { let original: Result, _> = match doc .get_first(schema.original) .expect("Unable to get original message") { Value::Bytes(b) => Ok(b.clone()), _ => Err("Missing original email from the index"), }; let _tags: HashSet = doc .get_all(schema.tag) .into_iter() .filter_map(|s| s.text()) .map(String::from) .collect(); TantivyMessage::from_data( original.map_err(|_| MessageError::from("Could not read original from index"))?, ) } } pub struct EmailSchema { schema: Schema, subject: Field, body: Field, from: Field, recipients: Field, #[allow(dead_code)] thread: Field, id: Field, date: Field, tag: Field, original: Field, } impl Default for EmailSchema { fn default() -> EmailSchema { let mut schema_builder = SchemaBuilder::default(); let subject = schema_builder.add_text_field("subject", TEXT | STORED); let body = schema_builder.add_text_field("body", TEXT | STORED); let from = schema_builder.add_text_field("from", TEXT | STORED); let recipients = schema_builder.add_text_field("recipients", TEXT | STORED); let thread = schema_builder.add_text_field("thread", STRING); let id = schema_builder.add_text_field("id", STRING | STORED); let tag = schema_builder.add_text_field("tag", STRING | STORED); let dateoptions = IntOptions::default() .set_fast(Cardinality::SingleValue) .set_stored() .set_indexed(); let date = schema_builder.add_u64_field("date", dateoptions); let original = schema_builder.add_text_field("original", TEXT | STORED); let schema = schema_builder.build(); EmailSchema { schema, subject, body, from, recipients, thread, id, date, tag, original, } } } impl EmailSchema { pub fn _new() -> EmailSchema { EmailSchema::default() } } pub struct TantivyStore { email: EmailSchema, index: tantivy::Index, reader: tantivy::IndexReader, writer: Option, threads: Option, mem_per_thread: Option, } impl IMessageSearcher for TantivyStore { fn start_indexing_process(&mut self, num: usize) -> Result<(), MessageStoreError> { if self.writer.is_none() { let writer = self.get_index_writer(num)?; self.writer = Some(writer); } Ok(()) } fn finish_indexing_process(&mut self) -> Result<(), MessageStoreError> { let writer = &mut self.writer; match writer { Some(writer) => match writer.commit() { Ok(_) => Ok(()), Err(_) => Err(MessageStoreError::CouldNotAddMessage( "Failed to commit to index".to_string(), )), }, None => Err(MessageStoreError::CouldNotAddMessage( "Trying to commit index without an actual index".to_string(), )), } } fn add_message( &mut self, msg: Message, parsed_body: String, ) -> Result { self._add_message(msg, parsed_body) } fn search_fuzzy( &self, query: String, num: usize, ) -> Result, MessageStoreError> { Ok(self.search(query.as_str(), num)) } fn delete_message(&mut self, _msg: &Message) -> Result<(), MessageStoreError> { Ok(()) } fn update_message(&mut self, _msg: Message) -> Result { unimplemented!(); } fn latest(&mut self, num: usize) -> Result, MessageStoreError> { self._latest(num, None) } fn get_message(&self, id: String) -> Result, MessageStoreError> { Ok(self._get_message(id.as_ref())) } fn get_messages_page( &self, start: usize, num: usize, ) -> Result, MessageStoreError> { self._latest(num, Some(start)) } } impl TantivyStore { pub fn new(path: PathBuf) -> Self { Self::_new(path, false) } fn _new(path: PathBuf, _ro: bool) -> Self { let email = EmailSchema::default(); let index = TantivyStore::open_or_create_index(path, email.schema.clone()); let reader = index .reader() .expect("Unable to create an index reader for this index. Is the index corrupted?"); TantivyStore { index, reader, writer: None, email, threads: None, mem_per_thread: None, } } pub fn _new_ro(path: PathBuf) -> Self { Self::_new(path, true) } fn open_or_create(path: PathBuf, schema: Schema) -> tantivy::Index { tantivy::Index::open_or_create(MmapDirectory::open(path).unwrap(), schema).unwrap() } fn open_or_create_index(path: PathBuf, schema: Schema) -> tantivy::Index { fs::create_dir_all(path.as_path()).unwrap_or_else(|_| { panic!( "Unable to create or access the given index directory {}", path.to_str().unwrap() ) }); TantivyStore::open_or_create(path, schema) } fn _add_message( &mut self, msg: Message, parsed_body: String, ) -> Result { let writer = &mut self.writer; match writer { Some(indexer) => { let mut document = Document::new(); let email = &self.email; document.add_text(email.subject, msg.subject.as_str()); document.add_text(email.id, msg.id.as_str()); document.add_text(email.body, parsed_body.as_str()); document.add_text(email.from, msg.from.as_str()); document.add_text(email.recipients, msg.recipients.join(", ").as_str()); document.add_bytes(email.original, msg.original); document.add_u64(email.date, msg.date); msg.tags .iter() .for_each(|t| document.add_text(email.tag, t.as_str())); indexer.add_document(document); indexer.commit(); Ok(msg.id) } None => Err(MessageStoreError::CouldNotAddMessage( "No indexer was allocated".to_string(), )), } } fn _delete_message(&mut self, msg: &Message) -> Result<(), MessageStoreError> { let writer = &mut self.writer; match writer { Some(indexer) => { let term = Term::from_field_text(self.email.id, msg.id.as_ref()); indexer.delete_term(term); Ok(()) } None => Err(MessageStoreError::CouldNotModifyMessage( "Can't delete the message. The index writer is not initialized".to_string(), )), } } pub fn _tag_doc(&self, doc: Document, _tags: Vec) -> Result<(), tantivy::TantivyError> { let mut writer = self.get_index_writer(1).ok().unwrap(); let id = TantivyMessage::from_tantivy(doc, &self.email) .map_err(|_| { tantivy::TantivyError::SystemError(String::from("Can't read message from index")) })? .id; let term = Term::from_field_text(self.email.id, id.as_ref()); writer.delete_term(term); writer.commit()?; self.reader.reload() } fn get_index_writer( &self, num_emails: usize, ) -> Result { let num_cpu = match self.threads { Some(threads) => threads, None => cmp::min( (num_cpus::get() as f32 / 1.5).floor() as usize, cmp::max(1, (0.0818598 * (num_emails as f32).powf(0.311549)) as usize), ), }; let mem_per_thread = match self.mem_per_thread { Some(mem) => mem * BYTES_IN_MB, None => { (if let Ok(mem_info) = sys_info::mem_info() { cmp::min( cmp::min( mem_info.avail as usize * 1024 / (num_cpu + 1), cmp::max( (0.41268337 * (num_emails as f32).powf(0.672702)) as usize * BYTES_IN_MB, 200 * BYTES_IN_MB, ), ), 2000 * BYTES_IN_MB, ) } else { 400 * BYTES_IN_MB }) as usize } }; info!( "We're using {} threads with {}mb memory per thread", num_cpu, mem_per_thread / BYTES_IN_MB ); match self .index .writer_with_num_threads(num_cpu, mem_per_thread * num_cpu) { Ok(index_writer) => Ok(index_writer), Err(_) => Err(MessageStoreError::CouldNotAddMessage( "Impossible to create the indexer".to_string(), )), } } pub fn _latest( &self, num: usize, _skip: Option, ) -> Result, MessageStoreError> { let searcher = self.reader.searcher(); let skip = _skip.unwrap_or(0); let docs = searcher .search( &AllQuery, &TopDocs::with_limit(num).and_offset(skip).order_by_u64_field(self.email.date), ) .map_err(|_| MessageStoreError::CouldNotGetMessages(vec![]))?; let mut ret = vec![]; for doc in docs { let retrieved_doc = searcher.doc(doc.1).unwrap(); ret.push( TantivyMessage::from_tantivy(retrieved_doc, &self.email).map_err(|_| { MessageStoreError::CouldNotGetMessage("Message is corrupt".to_string()) })?, ); } Ok(ret) } pub fn get_doc(&self, id: &str) -> Result { // Is this needed? self.reader.load_searchers()?; let searcher = self.reader.searcher(); let termq = RegexQuery::from_pattern(format!("{}.*",id).as_str(), self.email.id)?; let res = searcher.search(&termq, &(TopDocs::with_limit(1), Count)); match res { Ok((doc, count)) => match doc.first() { Some((_score, doc_address)) => searcher.doc(*doc_address), None => { error!("Got count {:}", count); Err(tantivy::TantivyError::InvalidArgument( "Document not found".to_string(), )) }, }, Err(e) => Err(e), } } pub fn _get_message(&self, id: &str) -> Option { let doc = self.get_doc(id); match doc { Ok(doc) => TantivyMessage::from_tantivy(doc, &self.email).ok(), _ => None, } } pub fn search(&self, text: &str, num: usize) -> Vec { let searcher = self.reader.searcher(); let term = Term::from_field_text(self.email.subject, text); let term_body = Term::from_field_text(self.email.body, text); let top_docs_by_date = TopDocs::with_limit(num).order_by_u64_field(self.email.date); let bquery = BooleanQuery::new_multiterms_query(vec![term, term_body]); let top_docs = searcher.search(&bquery, &top_docs_by_date).unwrap(); let mut ret = vec![]; for doc in top_docs { let retrieved_doc = searcher.doc(doc.1).unwrap(); if let Ok(d) = TantivyMessage::from_tantivy(retrieved_doc, &self.email) { ret.push(d); } } ret } #[allow(dead_code)] pub fn fuzzy(&self, text: &str, num: usize) -> Vec { let mut terms = text.split(' ').collect::>(); terms.insert(0, text); let searcher = self.reader.searcher(); let mut ret = self.search(text, num); for n in 1..2 { if ret.len() < num { let mut queries: Vec<(Occur, Box)> = vec![]; for (_, t) in terms.iter().enumerate() { let term = Term::from_field_text(self.email.subject, t); let term_body = Term::from_field_text(self.email.body, t); let query = FuzzyTermQuery::new(term, n, true); let query_body = FuzzyTermQuery::new(term_body, n, true); queries.push((Occur::Should, Box::new(query))); queries.push((Occur::Should, Box::new(query_body))); } let top_docs_by_date = TopDocs::with_limit(num).order_by_u64_field(self.email.date); let bquery = BooleanQuery::from(queries); let top_docs = searcher.search(&bquery, &top_docs_by_date).unwrap(); for doc in top_docs { let retrieved_doc = searcher.doc(doc.1).unwrap(); if let Ok(d) = TantivyMessage::from_tantivy(retrieved_doc, &self.email) { ret.push(d); } } } } ret } }