summary refs log tree commit diff stats
path: root/lurker/index/Index.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lurker/index/Index.cpp')
-rw-r--r-- lurker/index/Index.cpp 700
1 files changed, 700 insertions, 0 deletions
diff --git a/lurker/index/Index.cpp b/lurker/index/Index.cpp
new file mode 100644
index 0000000..4426c50
--- /dev/null
+++ b/lurker/index/Index.cpp
@@ -0,0 +1,700 @@
+/* $Id: Index.cpp 1649 2009-10-19 14:35:01Z terpstra $
+ *
+ * index.cpp - Insert all the keywords from the given email
+ *
+ * Copyright (C) 2002 - Wesley W. Terpstra
+ *
+ * License: GPL
+ *
+ * Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define _FILE_OFFSET_BITS 64
+
+#include <mimelib/headers.h>
+#include <mimelib/datetime.h>
+#include <mimelib/addrlist.h>
+#include <mimelib/address.h>
+#include <mimelib/group.h>
+#include <mimelib/mboxlist.h>
+#include <mimelib/mailbox.h>
+#include <mimelib/text.h>
+#include <mimelib/param.h>
+#include <mimelib/enum.h>
+#include <mimelib/body.h>
+#include <mimelib/bodypart.h>
+#include <mimelib/utility.h>
+
+#include <CharsetEscape.h>
+#include <Keys.h>
+#include <md5.h>
+#include <cstdlib>
+
+#include "Index.h"
+#include "Summary.h"
+
+#include <string>
+#include <vector>
+#include <iostream>
+
+#include <unistd.h>
+#include <iconv.h>
+#include <cerrno>
+
+using namespace std;
+
+#define MAX_MESSAGE_ID 80
+
+/** Shorten str to at most len bytes without splitting a UTF-8 sequence.
+ *
+ *  A string that already fits in len bytes (including one that is exactly
+ *  len bytes long) is left untouched. Otherwise the string is cut at len
+ *  and, if that cut leaves an incomplete or malformed sequence at the end
+ *  of the kept prefix, that trailing sequence is removed as well.
+ *
+ *  @param str string to shorten in place
+ *  @param len maximum number of bytes to keep
+ */
+void utf8Truncate(string& str, string::size_type len)
+{
+    // Nothing is cut, so a trailing multi-byte character must survive.
+    if (str.length() <= len) return;
+
+    // Walk back over the continuation bytes (10xxxxxx) that end the
+    // kept prefix to find where its last sequence begins.
+    string::size_type start = len;
+    while (start && (((unsigned char)str[start-1]) & 0xC0) == 0x80)
+        --start;
+
+    if (start && ((unsigned char)str[start-1]) >= 0xC0)
+    {
+        // A lead byte opens the last sequence; it announces how many
+        // bytes the sequence needs (0 marks an invalid lead byte).
+        --start;
+        const unsigned char lead = (unsigned char)str[start];
+        const string::size_type need =
+            lead >= 0xF8 ? 0 :
+            lead >= 0xF0 ? 4 :
+            lead >= 0xE0 ? 3 : 2;
+
+        // Keep the sequence only when all of its bytes made it in.
+        if (len - start != need) len = start;
+    }
+    else if (start != len)
+    {
+        // Continuation bytes without any lead byte: malformed, drop them.
+        len = start;
+    }
+
+    // len is now at the end of a complete multi-byte element or ascii
+    str.resize(len);
+}
+
+/* ASCII-only upper-casing: 'a'..'z' map to 'A'..'Z', every other value
+ * is returned unchanged.
+ */
+static inline char my_toupper(char x)
+{
+    const bool lowercase = ('a' <= x) && (x <= 'z');
+    return lowercase ? static_cast<char>(x - 'a' + 'A') : x;
+}
+
+/* Compare two strings for equality, treating ASCII letters
+ * case-insensitively (via my_toupper). Strings of different length are
+ * never equal.
+ */
+static bool strings_equal_case_ignored(const string& a, const string& b)
+{
+    const string::size_type count = a.length();
+    if (count != b.length()) return false;
+
+    // advance while the characters agree; a full pass means equality
+    string::size_type pos = 0;
+    while (pos < count && my_toupper(a[pos]) == my_toupper(b[pos]))
+        ++pos;
+
+    return pos == count;
+}
+
+/** Pick the first usable address from a linked list of addresses.
+ *
+ *  The list is walked via Next(). A group is searched recursively through
+ *  its mailbox list. A mailbox is usable when its local part and domain
+ *  are both non-empty, the joined "local@domain" is at most 128 bytes
+ *  long, and every byte of it lies strictly between 0x20 and 0x7f.
+ *
+ *  @param a       head of the address list; may be null
+ *  @param charset charset passed to decode_header() for the display name
+ *  @return first = address, second = name; both empty when the list
+ *          holds no usable address
+ */
+pair<string, string> pickAddress(DwAddress* a, const char* charset)
+{
+    for (; a != 0; a = a->Next())
+    {
+        if (a->IsGroup())
+        {
+            DwGroup* g = dynamic_cast<DwGroup*>(a);
+            if (g)
+            {
+                pair<string, string> out =
+                    pickAddress(
+                        g->MailboxList().FirstMailbox(),
+                        charset);
+                if (out.first != "") return out;
+            }
+        }
+        else
+        {
+            DwMailbox* m = dynamic_cast<DwMailbox*>(a);
+            if (m)
+            {
+                string name = m->FullName().c_str();
+                name = decode_header(name, charset);
+                DwString addr = m->LocalPart() + "@" + m->Domain();
+
+                // broken address? (one cannot safely cut this)
+                if (addr.length() > 128 ||
+                    m->LocalPart() == "" || m->Domain() == "")
+                {
+                    addr = "";
+                }
+
+                // reject control characters, spaces and non-ASCII bytes
+                for (size_t i = 0; i < addr.length(); ++i)
+                {
+                    if (addr[i] <= 0x20 || addr[i] >= 0x7f)
+                    {
+                        addr = "";
+                        break;
+                    }
+                }
+
+                // prune optional quotes, but only a matching pair:
+                // stripping unconditionally would eat the last real
+                // character of a name with a lone leading quote
+                if (name.length() >= 2 &&
+                    name[0] == '"' &&
+                    name[name.length()-1] == '"')
+                    name = name.substr(1, name.length()-2);
+
+                if (addr != "")
+                    return pair<string, string>(addr.c_str(), name);
+            }
+        }
+    }
+
+    return pair<string, string>("", "");
+}
+
+/** Determine the header charset and the author (email and name).
+ *
+ *  charset is taken from the charset parameter of the Content-Type
+ *  header, defaulting to ISO-8859-1. The author is looked up in Reply-To
+ *  first (ignored when it equals list.address, compared without regard
+ *  to ASCII case), then From, then Sender; each later header only fills
+ *  in whatever is still empty. The name is whitespace-sanitized and
+ *  truncated to 100 bytes.
+ *
+ *  @return always 0
+ */
+int Index::index_author()
+{
+    // one always has headers, but not always this function:
+    // if (message.hasHeaders())
+
+    charset = "ISO-8859-1"; // a good default as any
+
+    if (message.Headers().HasContentType())
+    {
+        DwParameter* p = message.Headers().ContentType().FirstParameter();
+        while (p)
+        {
+            // Parameter names are case-insensitive ("Charset=",
+            // "CHARSET="), so normalise before comparing -- the same way
+            // index_keywords() does for body parts.
+            DwString attr = p->Attribute();
+            attr.ConvertToLowerCase();
+            if (attr == "charset")
+                charset = p->Value().c_str();
+            p = p->Next();
+        }
+    }
+
+    // pickAddress only gives an author_name if it gave an author_email
+
+    if (message.Headers().HasReplyTo())
+    {
+        pair<string, string> addr = pickAddress(
+            message.Headers().ReplyTo().FirstAddress(),
+            charset.c_str());
+
+        author_email = addr.first;
+        author_name = addr.second;
+
+        // Some mailing lists set reply-to to the list itself.
+        if (strings_equal_case_ignored(author_email, list.address))
+        {
+            author_email = "";
+            author_name = "";
+        }
+    }
+
+    // Given a reply-to that is not the list, we allow the from to
+    // provide a fullname under the assumption it is the same person.
+
+    if (message.Headers().HasFrom())
+    {
+        pair<string, string> addr = pickAddress(
+            message.Headers().From().FirstMailbox(),
+            charset.c_str());
+
+        if (!author_email.length()) author_email = addr.first;
+        if (!author_name .length()) author_name = addr.second;
+    }
+
+    // ditto
+
+    if (message.Headers().HasSender())
+    {
+        pair<string, string> addr = pickAddress(
+            &message.Headers().Sender(),
+            charset.c_str());
+
+        if (!author_email.length()) author_email = addr.first;
+        if (!author_name .length()) author_name = addr.second;
+    }
+
+    author_name = whitespace_sanitize(author_name);
+    utf8Truncate(author_name, 100);
+    // - nothing longer than 128 could get here (from above)
+    // - one can never safely truncate an email address
+    // utf8Truncate(author_email, 100);
+
+    return 0;
+}
+
+// Whitespace test that doesn't vary with charset: true only for space,
+// newline, carriage return and tab.
+inline bool lu_isspace(char x)
+{
+    switch (x)
+    {
+    case ' ':
+    case '\n':
+    case '\r':
+    case '\t':
+        return true;
+    default:
+        return false;
+    }
+}
+
+/* Reduce a NUL-terminated string to a 4-byte hash: the 16-byte MD5
+ * digest of str is folded by XOR-ing its four 4-byte quarters together.
+ *
+ * str  - text to hash
+ * hash - output buffer; exactly 4 bytes are written
+ */
+void build_message_hash(const char* str, unsigned char* hash)
+{
+    MD5Context md5;
+    unsigned char digest[16];
+
+    MD5Init(&md5);
+    MD5Update(&md5, reinterpret_cast<const unsigned char*>(str), strlen(str));
+    MD5Final(digest, &md5);
+
+    // hash[k] combines byte k of each quarter of the digest
+    for (int k = 0; k < 4; ++k)
+        hash[k] = digest[k] ^ digest[k+4] ^ digest[k+8] ^ digest[k+12];
+}
+
+/* Keyword callback handed to my_keyword_digest_string(): stores one
+ * keyword for the message currently being indexed.
+ *
+ * keyword - the keyword text
+ * arg     - the Index* that was passed along with the callback
+ *
+ * The inserted key is LU_KEYWORD + keyword + NUL + raw message id.
+ * Returns whatever the writer's insert() returns.
+ */
+int feed_writer(const char* keyword, void* arg)
+{
+    Index* const self = static_cast<Index*>(arg);
+
+    string key(LU_KEYWORD);
+    key.append(keyword);
+    key.push_back('\0');
+    key.append(self->id.raw());
+
+    return self->writer->insert(key);
+}
+
+/** Compute the id of the message (timestamp plus 4-byte hash) and insert
+ *  the keywords derived from it.
+ *
+ *  Timestamp: the Date header is used when it is at most 7 days earlier
+ *  than the server time, when userdate is set, or when the server time
+ *  is not positive; otherwise the server time is used. A non-positive
+ *  result is replaced by 1.
+ *
+ *  Hash: taken from the first Message-Id if there is one, else from
+ *  author_email, else random.
+ *
+ *  @param userdate trust the Date header unconditionally
+ *  @param server   timestamp to fall back on
+ *  @param exist    set to true when the id is found in the blacklist; no
+ *                  keywords are inserted in that case
+ *  @return 0 on success (including the blacklisted case), -1 when an
+ *          insert fails
+ */
+int Index::index_id(bool userdate, time_t server, bool& exist)
+{
+    time_t stamp = server;
+    string messageId;
+    unsigned char hash[4];
+
+    // if (message.hasHeaders())
+
+    if (message.Headers().HasDate())
+    {
+        time_t user = message.Headers().Date().AsUnixTime();
+
+        /* User time must be earlier; there is delivery delay!
+         * However, more than 7 day delivery time is unlikely.
+         */
+        if ((user <= server && server < user+7*60*60*24) ||
+            userdate || // trusting the userdate?
+            server <= 0) // server time is unusable?
+            stamp = user;
+    }
+
+    if (stamp <= 0)
+    { // this is crazy; I don't care if they agree: it's wrong
+        stamp = 1; // liars all have timestamp 1970-01-01 00:00:01
+    }
+
+    if (message.Headers().HasMessageId())
+    {
+        vector<string> ids = extract_message_ids(
+            message.Headers().MessageId().AsString().c_str());
+
+        if (!ids.empty())
+            messageId = ids.front();
+    }
+
+    if (messageId.length())
+    {
+        // Constant message-id across import, and threadable
+        build_message_hash(messageId.c_str(), hash);
+    }
+    else if (author_email.length())
+    {
+        // This means no proper threading.
+        // At least the message-id is constant across import.
+        build_message_hash(author_email.c_str(), hash);
+    }
+    else
+    {
+        // Can't make any guarantees; just import it.
+        hash[0] = random() % 256;
+        hash[1] = random() % 256;
+        hash[2] = random() % 256;
+        hash[3] = random() % 256;
+    }
+
+    id = MessageId(stamp, hash);
+    if (blacklist.find(id) != blacklist.end())
+    {
+        // Messages marked as blacklisted use the 'exist' flag to
+        // avoid being imported into the database.
+        exist = true;
+        return 0;
+    }
+
+    if (messageId.length())
+    {
+        // Raw message-id for threading
+        if (writer->insert(
+            LU_KEYWORD +
+            string(LU_KEYWORD_MESSAGE_ID) +
+            messageId +
+            '\0' +
+            id.raw()) != 0)
+        {
+            cerr << "Failed to insert message id keyword!" << endl;
+            return -1;
+        }
+
+        // digested message-id for user searches
+        if (my_keyword_digest_string(
+            messageId.c_str(), messageId.length(),
+            LU_KEYWORD_MESSAGE_ID, &feed_writer, this, 0) != 0)
+        {
+            cerr << "Failed to index message-id" << endl;
+            return -1;
+        }
+    }
+
+    // keyword that every message carries
+    if (writer->insert(
+        LU_KEYWORD +
+        string(LU_KEYWORD_EVERYTHING) +
+        '\0' +
+        id.raw()) != 0)
+    {
+        cerr << "Failed to insert the any keyword!" << endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Store the summary records of the message: author email, author name,
+ *  subject, and the mailbox location.
+ *
+ *  The subject comes from the Subject header (decoded with charset),
+ *  falls back to "[...]" when empty, and is truncated to 200 bytes. The
+ *  location record is the mbox key followed by 12 bytes: off as 8 bytes
+ *  and len as 4 bytes, most significant byte first.
+ *
+ *  @param check when true, first look for an existing record under the
+ *               mbox key of this message
+ *  @param exist set to true when such a record is found; nothing is
+ *               inserted in that case
+ *  @return 0 on success or when the message already exists, -1 on error
+ */
+int Index::index_summary(bool check, bool& exist)
+{
+    const string prefix = LU_SUMMARY + id.raw();
+
+    if (message.Headers().HasSubject())
+    {
+        subject = message.Headers().Subject().AsString().c_str();
+        subject = decode_header(subject, charset.c_str());
+    }
+
+    if (subject.empty())
+        subject = "[...]";
+
+    const string mbox = prefix + LU_MESSAGE_MBOX + list.mbox + '\0';
+
+    if (check)
+    {
+        // Check for existence
+        auto_ptr<ESort::Walker> walker(writer->seek(mbox, "", ESort::Forward));
+
+        if (walker->advance() != -1)
+        { // a record was found, so the message is already in there
+            exist = true;
+            return 0;
+        }
+
+        // -1 with errno still 0 is taken as plain end-of-data
+        if (errno != 0) return -1;
+    }
+
+    unsigned char location[12];
+
+    // bytes 0..7: offset, filled from the least significant end
+    off_t offset = off;
+    for (int k = 8; k-- > 0; )
+    {
+        location[k] = (offset & 0xFF);
+        offset >>= 8;
+    }
+
+    // bytes 8..11: length, filled the same way
+    long length = len;
+    for (int k = 12; k-- > 8; )
+    {
+        location[k] = (length & 0xFF);
+        length >>= 8;
+    }
+
+    // Don't let crazy stuff in there.
+    utf8Truncate(subject, 200);
+
+    const string where(reinterpret_cast<const char*>(location), sizeof(location));
+
+    if (writer->insert(prefix + LU_MESSAGE_AUTHOR_EMAIL + author_email) != 0 ||
+        writer->insert(prefix + LU_MESSAGE_AUTHOR_NAME + author_name) != 0 ||
+        writer->insert(prefix + LU_MESSAGE_SUBJECT + subject) != 0 ||
+        writer->insert(mbox + where) != 0)
+    {
+        cerr << "Failed to insert summary keys" << endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Hash one referenced message-id and append the 4 hash bytes to suffix,
+ * unless suffix has already reached 200 bytes (too many references is
+ * bad, so keep it reasonable).
+ */
+static void append_reference_hash(string& suffix, const string& messageId)
+{
+    unsigned char hash[4];
+    build_message_hash(messageId.c_str(), hash);
+
+    if (suffix.length() < 200)
+        suffix.append((const char*)hash, 4);
+}
+
+/** Insert the threading records of the message.
+ *
+ *  Three keys are written: the thread keyword for the subject hash, the
+ *  threading key (subject hash, raw id, then the hashes of the referenced
+ *  messages), and the new-topics key for the list.
+ *
+ *  @return 0 on success, -1 when an insert fails
+ */
+int Index::index_threading()
+{
+    const string shash = subject_hash(subject.c_str());
+
+    if (writer->insert(
+        LU_KEYWORD
+        LU_KEYWORD_THREAD +
+        shash +
+        '\0' +
+        id.raw()) != 0)
+    {
+        cerr << "Failed to insert threading keyword" << endl;
+        return -1;
+    }
+
+    // if (message.hasHeaders())
+
+    string suffix;
+
+    if (message.Headers().HasInReplyTo())
+    {
+        const vector<string> replied = extract_message_ids(
+            message.Headers().InReplyTo().AsString().c_str());
+
+        // first in-reply-to is most relevant
+        for (vector<string>::size_type k = 0; k < replied.size(); ++k)
+            append_reference_hash(suffix, replied[k]);
+    }
+
+    if (message.Headers().HasReferences())
+    {
+        const vector<string> referenced = extract_message_ids(
+            message.Headers().References().AsString().c_str());
+
+        // last references is most recently added (most likely irt)
+        for (vector<string>::size_type k = referenced.size(); k > 0; --k)
+            append_reference_hash(suffix, referenced[k-1]);
+    }
+
+    if (writer->insert(
+        LU_THREADING
+        + shash
+        + id.raw()
+        + suffix) != 0)
+    {
+        cerr << "Failed to insert threading keys" << endl;
+        return -1;
+    }
+
+    if (writer->insert(
+        LU_NEW_TOPICS
+        + list.mbox + '\0'
+        + id.raw().substr(0, 4)
+        + shash) != 0)
+    {
+        cerr << "Failed to insert new topics keys" << endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Insert the control keywords of the message: list, group, languages,
+ *  the import-time cache key, the digested author email / author name /
+ *  subject, and one reply-to keyword per In-Reply-To message-id.
+ *
+ *  Every insert is attempted even after an earlier one has failed; the
+ *  failure is reported once at the end.
+ *
+ *  @param import timestamp used to build the cache key
+ *  @return 0 when everything was inserted, -1 otherwise
+ */
+int Index::index_control(time_t import)
+{
+    bool failed = false;
+
+    if (writer->insert(
+        LU_KEYWORD
+        LU_KEYWORD_LIST +
+        list.mbox +
+        '\0' +
+        id.raw()) != 0) failed = true;
+
+    /* emulated group and language searches are impossibly slow.
+     * these keywords are a must for large archives.
+     */
+    if (writer->insert(
+        LU_KEYWORD
+        LU_KEYWORD_GROUP +
+        list.group +
+        '\0' +
+        id.raw()) != 0) failed = true;
+
+    for (set<string>::const_iterator lang = list.languages.begin();
+         lang != list.languages.end(); ++lang)
+    {
+        if (writer->insert(
+            LU_KEYWORD
+            LU_KEYWORD_LANGUAGE +
+            *lang +
+            '\0' +
+            id.raw()) != 0) failed = true;
+    }
+
+    // cache key: first 4 bytes of the import stamp, then the raw id
+    MessageId importStamp(import);
+    if (writer->insert(
+        LU_CACHE +
+        importStamp.raw().substr(0, 4) +
+        id.raw()) != 0) failed = true;
+
+    // email and name are both digested under the author keyword
+    if (author_email.length() &&
+        my_keyword_digest_string(
+            author_email.c_str(), author_email.length(),
+            LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
+        failed = true;
+
+    if (author_name.length() &&
+        my_keyword_digest_string(
+            author_name.c_str(), author_name.length(),
+            LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
+        failed = true;
+
+    if (subject.length() &&
+        my_keyword_digest_string(
+            subject.c_str(), subject.length(),
+            LU_KEYWORD_SUBJECT, &feed_writer, this, 1) != 0)
+        failed = true;
+
+    if (message.Headers().HasInReplyTo())
+    {
+        vector<string> parents = extract_message_ids(
+            message.Headers().InReplyTo().AsString().c_str());
+        for (vector<string>::iterator p = parents.begin(); p != parents.end(); ++p)
+        {
+            if (writer->insert(
+                LU_KEYWORD
+                LU_KEYWORD_REPLY_TO +
+                *p + '\0' + id.raw()) != 0)
+                failed = true;
+        }
+    }
+
+#if 0 // this is questionable...
+    if (message.Headers().HasReferences())
+    {
+        vector<string> refs = extract_message_ids(
+            message.Headers().References().AsString().c_str());
+        for (vector<string>::iterator r = refs.begin(); r != refs.end(); ++r)
+        {
+            if (writer->insert(
+                LU_KEYWORD
+                LU_KEYWORD_REPLY_TO +
+                *r + '\0' + id.raw()) != 0)
+                failed = true;
+        }
+    }
+#endif
+
+    if (failed)
+    {
+        cerr << "Failed to insert control keys" << endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Index the words of a single entity body.
+ *
+ *  The body is decoded according to its Content-Transfer-Encoding
+ *  (quoted-printable or base64; the null, unknown, 7bit, 8bit and binary
+ *  values are taken verbatim, as is a body with no such header),
+ *  converted through CharsetEscape for the given charset, and digested
+ *  into word keywords.
+ *
+ *  @param e       entity whose body is indexed
+ *  @param charset charset handed to CharsetEscape
+ *  @return 0 on success, -1 when keyword digestion fails
+ */
+int Index::index_entity(DwEntity& e, const string& charset)
+{
+    DwString text;
+
+    if (!e.Headers().HasContentTransferEncoding())
+    {
+        text = e.Body().AsString();
+    }
+    else
+    {
+        const int cte = e.Headers().ContentTransferEncoding().AsEnum();
+
+        if (cte == DwMime::kCteQuotedPrintable)
+        {
+            DwDecodeQuotedPrintable(e.Body().AsString(), text);
+        }
+        else if (cte == DwMime::kCteBase64)
+        {
+            DwDecodeBase64(e.Body().AsString(), text);
+        }
+        else if (cte == DwMime::kCteNull ||
+                 cte == DwMime::kCteUnknown ||
+                 cte == DwMime::kCte7bit ||
+                 cte == DwMime::kCte8bit ||
+                 cte == DwMime::kCteBinary)
+        {
+            text = e.Body().AsString();
+        }
+        // any value not listed above leaves text empty
+    }
+
+    CharsetEscape decode(charset.c_str());
+    const string utf8 = decode.write(text.c_str(), text.length());
+
+    const int rc = my_keyword_digest_string(
+        utf8.c_str(), utf8.length(),
+        LU_KEYWORD_WORD, &feed_writer, this, 1);
+    if (rc != 0)
+    {
+        cerr << "Failed to index un-typed segment" << endl;
+        return -1;
+    }
+
+    return 0;
+}
+
+/** Recursively index the body keywords of an entity.
+ *
+ *  An entity without a Content-Type header is indexed directly. With a
+ *  Content-Type: an embedded message and every part of a multipart are
+ *  descended into, text/plain is indexed, and all other types are
+ *  skipped. A charset parameter on the Content-Type (name compared
+ *  case-insensitively) replaces the inherited charset for this entity
+ *  and everything below it.
+ *
+ *  @param e             entity to index
+ *  @param parentCharset charset inherited from the enclosing entity
+ *  @return 0 on success, -1 as soon as indexing of this entity or of any
+ *          nested entity fails
+ */
+int Index::index_keywords(DwEntity& e, const string& parentCharset)
+{
+    // if (e.hasHeaders() &&
+    if (!e.Headers().HasContentType())
+    {
+        if (index_entity(e, parentCharset) != 0) return -1;
+        return 0;
+    }
+
+    DwMediaType& mt = e.Headers().ContentType();
+
+    string effective = parentCharset;
+    for (DwParameter* p = mt.FirstParameter(); p; p = p->Next())
+    {
+        DwString attr = p->Attribute();
+        attr.ConvertToLowerCase(); // case insens
+        if (attr == "charset") effective = p->Value().c_str();
+    }
+
+    switch (mt.Type())
+    {
+    case DwMime::kTypeMessage:
+        // A failure below must not be swallowed: it is reported exactly
+        // like a failure to index this entity itself.
+        if (e.Body().Message() &&
+            index_keywords(*e.Body().Message(), effective) != 0)
+            return -1;
+        break;
+
+    case DwMime::kTypeMultipart:
+        // index all alternatives in multipart
+        for (DwBodyPart* p = e.Body().FirstBodyPart(); p != 0; p = p->Next())
+        {
+            if (index_keywords(*p, effective) != 0) return -1;
+        }
+        break;
+
+    case DwMime::kTypeText:
+        if (mt.Subtype() == DwMime::kSubtypePlain)
+        {
+            if (index_entity(e, effective) != 0) return -1;
+        }
+        break;
+    }
+
+    return 0;
+}
+
+/** Index the whole message by running the individual index_* stages in
+ *  order.
+ *
+ *  @param userdate passed to index_id()
+ *  @param envelope passed to index_id() as the server timestamp
+ *  @param import   passed to index_control()
+ *  @param check    passed to index_summary()
+ *  @param exist    cleared on entry; set by index_id() or index_summary(),
+ *                  in which case the remaining stages are skipped
+ *  @return 0 on success or when exist was set, -1 when a stage fails
+ */
+int Index::index(bool userdate, time_t envelope, time_t import, bool check, bool& exist)
+{
+    exist = false;
+
+//  cout << message.Headers().Subject().AsString().c_str() << endl;
+
+    if (index_author() < 0) return -1;
+
+    /* If the message is blacklisted, we mark it as 'existing' */
+    if (index_id(userdate, envelope, exist) < 0) return -1;
+    if (exist) return 0;
+
+    /* If the message is already imported, mark it as 'existing' */
+    if (index_summary(check, exist) < 0) return -1;
+    if (exist) return 0;
+
+    // remaining stages run in order; the first failure stops the chain
+    const bool failed =
+        index_threading() < 0 ||
+        index_control(import) < 0 ||
+        index_keywords(message, "ISO-8859-1") < 0;
+
+    return failed ? -1 : 0;
+}