1 files changed, 700 insertions, 0 deletions
diff --git a/lurker/index/Index.cpp b/lurker/index/Index.cpp
new file mode 100644
index 0000000..4426c50
--- /dev/null
+++ b/lurker/index/Index.cpp
@@ -0,0 +1,700 @@
+/*  $Id: Index.cpp 1649 2009-10-19 14:35:01Z terpstra $
+ *  
+ *  index.cpp - Insert all the keywords from the given email
+ *  
+ *  Copyright (C) 2002 - Wesley W. Terpstra
+ *  
+ *  License: GPL
+ *  
+ *  Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
+ *  
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; version 2.
+ *    
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *    
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+ 
+#define _FILE_OFFSET_BITS 64
+
+#include <mimelib/headers.h>
+#include <mimelib/datetime.h>
+#include <mimelib/addrlist.h>
+#include <mimelib/address.h>
+#include <mimelib/group.h>
+#include <mimelib/mboxlist.h>
+#include <mimelib/mailbox.h>
+#include <mimelib/text.h>
+#include <mimelib/param.h>
+#include <mimelib/enum.h>
+#include <mimelib/body.h>
+#include <mimelib/bodypart.h>
+#include <mimelib/utility.h>
+
+#include <CharsetEscape.h>
+#include <Keys.h>
+#include <md5.h>
+#include <cstdlib>
+
+#include "Index.h"
+#include "Summary.h"
+
+#include <string>
+#include <vector>
+#include <iostream>
+
+#include <unistd.h>
+#include <iconv.h>
+#include <cerrno>
+
+using namespace std;
+
+#define MAX_MESSAGE_ID	80
+
+// Shorten str to at most len bytes without leaving a partial UTF-8 character
+// at the end. A string that already fits in len bytes is left untouched.
+void utf8Truncate(string& str, string::size_type len)
+{
+	if (str.length() <= len) return;
+	
+	// A continuation byte (10xxxxxx) just past the cut means the cut falls
+	// inside a character: drop the continuation bytes that made it in.
+	if ((((unsigned char)str[len]) & 0xC0) == 0x80)
+		while (len && (((unsigned char)str[len-1]) & 0xC0) == 0x80)
+			--len;
+	// a lead byte at the very end can never be complete; rewind past it
+	while (len && ((unsigned char)str[len-1]) >= 0xC0)
+		--len;
+	str.resize(len);
+}
+
+// ASCII-only upper-casing: 'a'..'z' become 'A'..'Z', other bytes pass through.
+static inline char my_toupper(char x)
+{
+	const bool lower = (x >= 'a' && x <= 'z');
+	return lower ? (char)(x - 'a' + 'A') : x;
+}
+
+// True when a and b have equal length and match byte for byte after ASCII-only
+// case folding through my_toupper.
+static bool strings_equal_case_ignored(const string& a, const string& b)
+{
+	string::size_type pos = 0, n = a.length();
+	if (n != b.length()) return false;
+	while (pos < n && my_toupper(a[pos]) == my_toupper(b[pos])) ++pos;
+	return pos == n;
+}
+
+// Return the first usable mailbox found in the address list starting at a.
+// first = address (local@domain), second = name run through decode_header.
+// Groups are searched recursively; ("", "") means nothing usable was found.
+pair<string, string> pickAddress(DwAddress* a, const char* charset)
+{
+	for (; a != 0; a = a->Next())
+	{
+		if (a->IsGroup())
+		{
+			DwGroup* g = dynamic_cast<DwGroup*>(a);
+			if (g)
+			{
+				pair<string, string> out = pickAddress(
+					g->MailboxList().FirstMailbox(), charset);
+				if (out.first != "") return out;
+			}
+		}
+		else
+		{
+			DwMailbox* m = dynamic_cast<DwMailbox*>(a);
+			if (m)
+			{
+				string name = m->FullName().c_str();
+				name = decode_header(name, charset);
+				DwString addr = m->LocalPart() + "@" + m->Domain();
+				
+				// unusable address? (one cannot safely cut this)
+				if (addr.length() > 128 || 
+				    m->LocalPart() == "" || m->Domain() == "")
+				{
+					addr = "";
+				}
+				
+				for (size_t i = 0; i < addr.length(); ++i)
+				{
+					if (addr[i] <= 0x20 || addr[i] >= 0x7f)
+					{	// byte outside printable ascii (0x21..0x7e)
+						addr = "";
+						break;
+					}
+				}
+				
+				// prune optional quotes, but only a matched pair
+				if (name.length() >= 2 && name[0] == '"' &&
+				    name[name.length()-1] == '"')
+					name = name.substr(1, name.length()-2);
+				
+				if (addr != "")
+					return pair<string, string>(addr.c_str(), name);
+			}
+		}
+	}
+	return pair<string, string>("", "");
+}
+
+// Fill in charset, author_email and author_name from the message headers.
+// The address comes from Reply-To (unless that is the list itself), then
+// From, then Sender; the later headers only fill fields that are still empty.
+// Always returns 0.
+int Index::index_author()
+{
+	// one always has headers, but not always this function:
+	// if (message.hasHeaders())
+	charset = "ISO-8859-1"; // a good default as any
+	
+	if (message.Headers().HasContentType())
+	{
+		DwParameter* p = message.Headers().ContentType().FirstParameter();
+		for (; p; p = p->Next())
+		{
+			// attribute names are case insensitive (as in index_keywords)
+			DwString attr = p->Attribute();
+			attr.ConvertToLowerCase();
+			if (attr == "charset") charset = p->Value().c_str();
+		}
+	}
+	
+	// pickAddress only gives an author_name if it gave an author_email
+	if (message.Headers().HasReplyTo())
+	{
+		pair<string, string> addr = pickAddress(
+			message.Headers().ReplyTo().FirstAddress(),
+			charset.c_str());
+		
+		author_email = addr.first;
+		author_name  = addr.second;
+		
+		// Some evil mailing lists set reply-to the list.
+		if (strings_equal_case_ignored(author_email, list.address))
+		{
+			author_email = "";
+			author_name = "";
+		}
+	}
+	
+	// Given a reply-to that is not the list, we allow the from to
+	// provide a fullname under the assumption it is the same person.
+	if (message.Headers().HasFrom())
+	{
+		pair<string, string> addr = pickAddress(
+			message.Headers().From().FirstMailbox(),
+			charset.c_str());
+		
+		if (!author_email.length()) author_email = addr.first;
+		if (!author_name .length()) author_name  = addr.second;
+	}
+	
+	// ditto
+	if (message.Headers().HasSender())
+	{
+		pair<string, string> addr = pickAddress(
+			&message.Headers().Sender(),
+			charset.c_str());
+		
+		if (!author_email.length()) author_email = addr.first;
+		if (!author_name .length()) author_name  = addr.second;
+	}
+	
+	author_name = whitespace_sanitize(author_name);
+	utf8Truncate(author_name, 100);
+	//  - nothing longer than 128 could get here (from above)
+	//  - one can never safely truncate an email address
+	// utf8Truncate(author_email, 100);
+	return 0;
+}
+
+// Doesn't vary with charset: only space, LF, CR and TAB count as whitespace.
+inline bool lu_isspace(char x)
+{
+	switch (x) { case ' ': case '\n': case '\r': case '\t': return true; default: return false; }
+}
+
+// Digest the NUL-terminated string str with MD5 and fold the 16 digest bytes
+// into the 4 bytes written to hash.
+void build_message_hash(const char* str, unsigned char* hash)
+{
+	unsigned char digest[16];
+	MD5Context md5;
+	
+	MD5Init(&md5);
+	MD5Update(&md5, (const unsigned char*)str, strlen(str));
+	MD5Final(digest, &md5);
+	
+	// hash[k] is the XOR of digest bytes k, k+4, k+8 and k+12
+	for (int k = 0; k < 4; ++k)
+		hash[k] = digest[k] ^ digest[k+4] ^ digest[k+8] ^ digest[k+12];
+}
+
+// Keyword callback: arg is the Index doing the import. Inserts the key
+// LU_KEYWORD + keyword + NUL + raw message id and returns insert's result.
+int feed_writer(const char* keyword, void* arg)
+{
+	Index* self = (Index*)arg;
+	
+	string key = string(LU_KEYWORD) + keyword;
+	key.push_back('\0');
+	key.append(self->id.raw());
+	return self->writer->insert(key);
+}
+
+// Choose the timestamp and 4-byte hash that make up this message's id, then
+// insert the message-id keywords and the LU_KEYWORD_EVERYTHING keyword.
+//   userdate - trust the Date header even when it disagrees with server
+//   server   - timestamp used unless the Date header is preferred
+//   exist    - set to true (and 0 is returned) when the id is blacklisted
+// Returns 0 on success, -1 if a keyword could not be inserted.
+int Index::index_id(bool userdate, time_t server, bool& exist)
+{
+	time_t stamp = server;
+	string messageId;
+	unsigned char hash[4];
+	
+	// if (message.hasHeaders())
+	if (message.Headers().HasDate())
+	{
+		time_t user = message.Headers().Date().AsUnixTime();
+		
+		/* User time must be earlier; there is delivery delay!
+		 * However, more than 7 day delivery time is unlikely.
+		 */
+		if ((user <= server && server < user+7*60*60*24) ||
+		    userdate ||  // trusting the userdate?
+		    server <= 0) // server is on crack?
+			stamp = user;
+	}
+	
+	if (stamp <= 0)
+	{	// this is crazy; I don't care if they agree: it's wrong
+		stamp = 1; // liars all have timestamp 1970-01-01 00:00:01
+	}
+	
+	if (message.Headers().HasMessageId())
+	{
+		vector<string> ids = extract_message_ids(
+			message.Headers().MessageId().AsString().c_str());
+		
+		if (!ids.empty())
+			messageId = ids.front();
+	}
+	
+	if (messageId.length())
+	{
+		// Constant message-id across import, and threadable
+		build_message_hash(messageId.c_str(), hash);
+	}
+	else if (author_email.length())
+	{
+		// This means no proper threading.
+		// At least the message-id is constant across import.
+		build_message_hash(author_email.c_str(), hash);
+	}
+	else
+	{
+		// Can't make any guarantees; just import it.
+		for (int k = 0; k < 4; ++k)
+			hash[k] = random() % 256;
+	}
+	
+	id = MessageId(stamp, hash);
+	if (blacklist.find(id) != blacklist.end())
+	{
+		// Messages marked as blacklisted use the 'exist' flag to
+		// avoid being imported into the database.
+		exist = true;
+		return 0;
+	}
+	
+	if (messageId.length())
+	{
+		// Raw message-id for threading
+		if (writer->insert(
+			LU_KEYWORD + string(LU_KEYWORD_MESSAGE_ID) +
+			messageId + '\0' + id.raw()) != 0)
+		{
+			cerr << "Failed to insert message id keyword!" << endl;
+			return -1;
+		}
+		
+		// digested message-id for user searches
+		if (my_keyword_digest_string(
+			messageId.c_str(), messageId.length(),
+			LU_KEYWORD_MESSAGE_ID, &feed_writer, this, 0) != 0)
+		{
+			cerr << "Failed to index message-id" << endl;
+			return -1;
+		}
+	}
+	
+	if (writer->insert(
+		LU_KEYWORD +
+		string(LU_KEYWORD_EVERYTHING) + 
+		'\0' + 
+		id.raw()) != 0)
+	{
+		cerr << "Failed to insert the any keyword!" << endl;
+		return -1;
+	}
+	
+	return 0;
+}
+
+// Write the summary records (author email, author name, subject and mbox
+// location) of the current message.
+//   check - look for an existing mbox record of this message first
+//   exist - set to true when check finds one; nothing is written then
+// Returns 0 on success or when the message was found, -1 on error.
+int Index::index_summary(bool check, bool& exist)
+{
+	string prefix = LU_SUMMARY + id.raw();
+	
+	if (message.Headers().HasSubject())
+	{
+		subject = message.Headers().Subject().AsString().c_str();
+		subject = decode_header(subject, charset.c_str());
+	}
+	
+	if (subject.empty())
+		subject = "[...]";
+	
+	string mbox = prefix + LU_MESSAGE_MBOX + list.mbox + '\0';
+	
+	if (check)
+	{
+		// Is there already a record for this message in this mbox?
+		auto_ptr<ESort::Walker> w(writer->seek(mbox, "", ESort::Forward));
+		
+		if (w->advance() != -1)
+		{	// it succeeded, so it is already in there
+			exist = true;
+			return 0;
+		}
+		
+		// advance failed; was it just eof?
+		if (errno != 0) return -1;
+	}
+	
+	// location: 8 bytes taken from off followed by 4 bytes taken from len,
+	// both stored most significant byte first
+	unsigned char buf[12];
+	off_t o = off;
+	long l = len;
+	
+	for (int i = 7; i >= 0; --i, o >>= 8)
+		buf[i] = (o & 0xFF);
+	for (int i = 11; i >= 8; --i, l >>= 8)
+		buf[i] = (l & 0xFF);
+	string location((char*)buf, 12);
+	
+	// Don't let crazy stuff in there.
+	utf8Truncate(subject, 200);
+	
+	if (writer->insert(prefix + LU_MESSAGE_AUTHOR_EMAIL + author_email) != 0 ||
+	    writer->insert(prefix + LU_MESSAGE_AUTHOR_NAME  + author_name)  != 0 ||
+	    writer->insert(prefix + LU_MESSAGE_SUBJECT      + subject)      != 0 ||
+	    writer->insert(mbox + location) != 0)
+	{
+		cerr << "Failed to insert summary keys" << endl;
+		return -1;
+	}
+	
+	return 0;
+}
+
+// Record the threading data of the current message:
+//  - the thread keyword for the hash of its subject,
+//  - the LU_THREADING record: subject hash, id, then the 4-byte hashes of
+//    the ids named in In-Reply-To and References,
+//  - the LU_NEW_TOPICS record. Returns 0 on success, -1 if an insert failed.
+int Index::index_threading()
+{
+	string shash = subject_hash(subject.c_str());
+	
+	string threadKeyword =
+		LU_KEYWORD
+		LU_KEYWORD_THREAD + 
+		shash + '\0' + id.raw();
+	if (writer->insert(threadKeyword) != 0)
+	{
+		cerr << "Failed to insert threading keyword" << endl;
+		return -1;
+	}
+	
+	// hashes of the message ids this message refers to
+	string suffix;
+	unsigned char hash[4];
+	
+	if (message.Headers().HasInReplyTo())
+	{
+		vector<string> ids = extract_message_ids(
+			message.Headers().InReplyTo().AsString().c_str());
+		
+		// first in-reply-to is most relevant
+		for (vector<string>::size_type n = 0; n < ids.size(); ++n)
+		{
+			build_message_hash(ids[n].c_str(), hash);
+			
+			// keep it reasonable; too many reply-tos is bad
+			if (suffix.length() < 200)
+				suffix.append((const char*)hash, 4);
+		}
+	}
+	
+	if (message.Headers().HasReferences())
+	{
+		vector<string> ids = extract_message_ids(
+			message.Headers().References().AsString().c_str());
+		
+		// last references is most recently added (most likely irt)
+		for (vector<string>::size_type n = ids.size(); n > 0; --n)
+		{
+			build_message_hash(ids[n-1].c_str(), hash);
+			
+			// keep it reasonable; too many reply-tos is bad
+			if (suffix.length() < 200)
+				suffix.append((const char*)hash, 4);
+		}
+	}
+	
+	// subject hash + id + hashes collected above
+	string threadRecord = LU_THREADING + shash + id.raw() + suffix;
+	if (writer->insert(threadRecord) != 0)
+	{
+		cerr << "Failed to insert threading keys" << endl;
+		return -1;
+	}
+	
+	// mbox + NUL + first 4 bytes of the id + subject hash
+	string topicRecord =
+		LU_NEW_TOPICS + list.mbox + '\0' + id.raw().substr(0, 4) + shash;
+	if (writer->insert(topicRecord) != 0)
+	{
+		cerr << "Failed to insert new topics keys" << endl;
+		return -1;
+	}
+	
+	return 0;
+}
+
+// Insert the keys that let the current message be found by list, group,
+// language, import time, author, subject and the ids it replies to.
+//   import - import time; the first 4 bytes of MessageId(import).raw() go
+//            into the LU_CACHE key
+// Every insert is attempted even when an earlier one failed.
+// Returns 0 if all of them succeeded, -1 otherwise.
+int Index::index_control(time_t import)
+{
+	bool ok = true;
+	
+	ok = (writer->insert(
+		LU_KEYWORD 
+		LU_KEYWORD_LIST +
+		list.mbox + '\0' + id.raw()) == 0) && ok;
+	
+	/* emulated group and language searches are impossibly slow.
+	 * these keywords are a must for large archives.
+	 */
+	ok = (writer->insert(
+		LU_KEYWORD
+		LU_KEYWORD_GROUP +
+		list.group + '\0' + id.raw()) == 0) && ok;
+	
+	// one keyword per entry of list.languages
+	for (set<string>::const_iterator lang = list.languages.begin();
+	     lang != list.languages.end(); ++lang)
+	{
+		ok = (writer->insert(
+			LU_KEYWORD
+			LU_KEYWORD_LANGUAGE +
+			*lang + '\0' + id.raw()) == 0) && ok;
+	}
+	
+	MessageId importStamp(import);
+	ok = (writer->insert(
+		LU_CACHE +
+		importStamp.raw().substr(0, 4) +
+		id.raw()) == 0) && ok;
+	
+	// author email, author name and subject are digested into keywords
+	if (!author_email.empty())
+	{
+		ok = (my_keyword_digest_string(
+			author_email.c_str(), author_email.length(),
+			LU_KEYWORD_AUTHOR, &feed_writer, this, 1) == 0) && ok;
+	}
+	
+	if (!author_name.empty())
+	{
+		ok = (my_keyword_digest_string(
+			author_name.c_str(), author_name.length(),
+			LU_KEYWORD_AUTHOR, &feed_writer, this, 1) == 0) && ok;
+	}
+	
+	if (!subject.empty())
+	{
+		ok = (my_keyword_digest_string(
+			subject.c_str(), subject.length(),
+			LU_KEYWORD_SUBJECT, &feed_writer, this, 1) == 0) && ok;
+	}
+	
+	if (message.Headers().HasInReplyTo())
+	{
+		vector<string> ids = extract_message_ids(
+			message.Headers().InReplyTo().AsString().c_str());
+		for (vector<string>::size_type n = 0; n < ids.size(); ++n)
+			ok = (writer->insert(
+				LU_KEYWORD
+				LU_KEYWORD_REPLY_TO +
+				ids[n] + '\0' + id.raw()) == 0) && ok;
+	}
+	
+#if 0	// this is questionable...
+	if (message.Headers().HasReferences())
+	{
+		vector<string> ids = extract_message_ids(
+			message.Headers().References().AsString().c_str());
+		for (vector<string>::size_type n = 0; n < ids.size(); ++n)
+			ok = (writer->insert(
+				LU_KEYWORD
+				LU_KEYWORD_REPLY_TO +
+				ids[n] + '\0' + id.raw()) == 0) && ok;
+	}
+#endif
+	
+	if (!ok)
+	{
+		cerr << "Failed to insert control keys" << endl;
+		return -1;
+	}
+	
+	return 0;
+}
+
+// Index the body of a single entity as plain words.
+// The body is decoded according to the Content-Transfer-Encoding header of e
+// (quoted-printable and base64 are decoded, anything else is used as is),
+// passed through a CharsetEscape built for charset, and the result is fed to
+// the keyword digester under LU_KEYWORD_WORD.
+// Returns 0 on success, -1 if the digester reported a failure.
+int Index::index_entity(DwEntity& e, const string& charset)
+{
+	DwString text;
+	int cte = DwMime::kCteNull; // no header: body is taken as is
+	if (e.Headers().HasContentTransferEncoding())
+		cte = e.Headers().ContentTransferEncoding().AsEnum();
+	
+	switch (cte)
+	{
+	case DwMime::kCteQuotedPrintable:
+		DwDecodeQuotedPrintable(e.Body().AsString(), text);
+		break;
+	
+	case DwMime::kCteBase64:
+		DwDecodeBase64(e.Body().AsString(), text);
+		break;
+	default:
+		// 7bit, 8bit, binary, null, unknown and any value not listed
+		// above: index the raw body instead of silently indexing nothing
+		text = e.Body().AsString();
+		break;
+	}
+	
+	CharsetEscape decode(charset.c_str());
+	string utf8 = decode.write(text.c_str(), text.length());
+	
+	if (my_keyword_digest_string(
+		utf8.c_str(), utf8.length(),
+		LU_KEYWORD_WORD, &feed_writer, this, 1) != 0)
+	{
+		cerr << "Failed to index un-typed segment" << endl;
+		return -1;
+	}
+	
+	return 0;
+}
+
+// Index the searchable text of e and of everything nested inside it.
+//  - message/*   : recurse into the embedded message, if there is one
+//  - multipart/* : recurse into every body part
+//  - text/plain, or no Content-Type at all : index the body itself
+// A charset parameter on e's Content-Type replaces parentCharset for e and
+// for the entities below it. Returns 0 on success, or -1 as soon as e or
+// any nested entity fails to index.
+int Index::index_keywords(DwEntity& e, const string& parentCharset)
+{
+	// if (e.hasHeaders() && 
+	if (!e.Headers().HasContentType())
+		return index_entity(e, parentCharset) != 0 ? -1 : 0;
+	
+	DwMediaType& t = e.Headers().ContentType();
+	string charset = parentCharset;
+	
+	// the last charset parameter wins
+	for (DwParameter* p = t.FirstParameter(); p; p = p->Next())
+	{
+		DwString attr = p->Attribute();
+		attr.ConvertToLowerCase(); // case insens
+		if (attr == "charset") charset = p->Value().c_str();
+	}
+	
+	switch (t.Type())
+	{
+	case DwMime::kTypeMessage:
+		if (e.Body().Message() &&
+		    index_keywords(*e.Body().Message(), charset) != 0)
+			return -1;
+		break;
+		
+	case DwMime::kTypeMultipart:
+		// index all alternatives in multipart; a failure in any part
+		// is reported just like a failure of a top-level text part
+		for (DwBodyPart* p = e.Body().FirstBodyPart(); p != 0; p = p->Next())
+			if (index_keywords(*p, charset) != 0) return -1;
+		break;
+		
+	case DwMime::kTypeText:
+		if (t.Subtype() == DwMime::kSubtypePlain &&
+		    index_entity(e, charset) != 0)
+			return -1;
+		break;
+	}
+	
+	return 0;
+}
+
+// Index the current message from start to finish. exist is cleared first
+// and is set by index_id (blacklisted) or index_summary (already imported);
+// in either case 0 is returned and the remaining stages are skipped.
+// Returns -1 as soon as any stage reports an error.
+int Index::index(bool userdate, time_t envelope, time_t import, bool check, bool& exist)
+{
+	exist = false;
+	
+	/* If the message is blacklisted, we mark it as 'existing' */
+	if (index_author() < 0 || index_id(userdate, envelope, exist) < 0)
+		return -1;
+	if (exist) return 0;
+	
+	/* If the message is already imported, mark it as 'existing' */
+	if (index_summary(check, exist) < 0) return -1;
+	if (exist) return 0;
+	
+	const bool failed = index_threading() < 0 ||
+	                    index_control(import) < 0 ||
+	                    index_keywords(message, "ISO-8859-1") < 0;
+	return failed ? -1 : 0;
+}