From 5ad0f6610fbaa0c3a3d72296815fe568864b07bd Mon Sep 17 00:00:00 2001 From: Nat Friedman Date: Thu, 27 Apr 2000 15:31:08 +0000 Subject: Oops, forgot this. svn path=/trunk/; revision=2657 --- e-util/ename/e-name-western.c | 856 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 856 insertions(+) create mode 100644 e-util/ename/e-name-western.c (limited to 'e-util/ename/e-name-western.c') diff --git a/e-util/ename/e-name-western.c b/e-util/ename/e-name-western.c new file mode 100644 index 0000000000..98e03a6682 --- /dev/null +++ b/e-util/ename/e-name-western.c @@ -0,0 +1,856 @@ +/* + * A simple Western name parser. + * + * Jamie, do you know anything about name parsing? + * Are you going down that rat hole? Bring a flashlight. + * + * Author: + * Nat Friedman (nat@helixcode.com) + * + * Copyright 1999, Helix Code, Inc. + */ + +#include +#include +#include + +#include +#include + +typedef struct { + int prefix_idx; + int first_idx; + int middle_idx; + int nick_idx; + int last_idx; + int suffix_idx; +} ENameWesternIdxs; + +static int +e_name_western_str_count_words (char *str) +{ + int word_count; + char *p; + + word_count = 0; + + for (p = str; p != NULL; p = strchr (p, ' ')) { + word_count ++; + p ++; + } + + return word_count; +} + +static void +e_name_western_cleanup_string (char **str) +{ + char *newstr; + char *p; + + if (*str == NULL) + return; + + p = *str; + while (isspace (*p) || *p == ',') + p ++; + + newstr = g_strdup (p); + + p = newstr + strlen (newstr) - 1; + while (isspace (*p) || *p == ',') + p --; + if ((! isspace (*p)) && *p != ',') + p ++; + *p = '\0'; + + g_free (*str); + *str = newstr; +} + +static char * +e_name_western_get_words_at_idx (char *str, int idx, int num_words) +{ + char *words; + char *p; + int word_count; + int words_len; + + /* + * Walk to the end of the words. + */ + word_count = 0; + p = str + idx; + while (word_count < num_words && *p != '\0') { + while (! isspace (*p) && *p != '\0') + p ++; + + while (isspace (*p) && *p != '\0') + p ++; + + word_count ++; + } + + words_len = p - str - idx - 1; + + if (*p == '\0') + words_len ++; + + words = g_malloc0 (1 + words_len); + strncpy (words, str + idx, words_len); + + return words; +} + +/* + * What the fuck is wrong with glib's MAX macro. + */ +static int +e_name_western_max (const int a, const int b) +{ + if (a > b) + return a; + + return b; +} + +static gboolean +e_name_western_word_is_suffix (char *word) +{ + int i; + + for (i = 0; e_name_western_sfx_table [i] != NULL; i ++) { + if (g_strcasecmp (word, e_name_western_sfx_table [i])) + continue; + + return TRUE; + } + + return FALSE; +} + +static char * +e_name_western_get_one_prefix_at_str (char *str) +{ + char *word; + int i; + + /* + * Check for prefixes from our table. + */ + for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) { + int pfx_words; + char *words; + + pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]); + words = e_name_western_get_words_at_idx (str, 0, pfx_words); + + if (! g_strcasecmp (words, e_name_western_pfx_table [i])) + return words; + + g_free (words); + } + + /* + * Check for prefixes we don't know about. These are always a + * sequence of more than one letters followed by a period. + */ + word = e_name_western_get_words_at_idx (str, 0, 1); + + if (strlen (word) > 2 && isalpha (word [0]) && isalpha (word [1]) && + word [strlen (word) - 1] == '.') + return word; + + g_free (word); + + return NULL; +} + +static char * +e_name_western_get_prefix_at_str (char *str) +{ + char *pfx; + char *pfx1; + char *pfx2; + char *p; + + /* Get the first prefix. */ + pfx1 = e_name_western_get_one_prefix_at_str (str); + + if (pfx1 == NULL) + return NULL; + + /* Check for a second prefix. */ + p = str + strlen (pfx1); + while (isspace (*p) && *p != '\0') + p ++; + + pfx2 = e_name_western_get_one_prefix_at_str (p); + + if (pfx2 != NULL) { + int pfx_len; + + pfx_len = (p + strlen (pfx2)) - str; + pfx = g_malloc0 (pfx_len + 1); + strncpy (pfx, str, pfx_len); + } else { + pfx = g_strdup (pfx1); + } + + g_free (pfx1); + g_free (pfx2); + + return pfx; +} + +static void +e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs) +{ + char *pfx; + + pfx = e_name_western_get_prefix_at_str (name->full); + + if (pfx == NULL) + return; + + idxs->prefix_idx = 0; + name->prefix = pfx; +} + +static gboolean +e_name_western_is_complex_last_beginning (char *word) +{ + int i; + + for (i = 0; e_name_western_complex_last_table [i] != NULL; i ++) { + + if (! g_strcasecmp ( + word, e_name_western_complex_last_table [i])) + return TRUE; + } + + return FALSE; +} + +static void +e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs) +{ + /* + * If there's a prefix, then the first name is right after it. + */ + if (idxs->prefix_idx != -1) { + int first_idx; + char *p; + + first_idx = idxs->prefix_idx + strlen (name->prefix); + + /* Skip past white space. */ + p = name->full + first_idx; + while (isspace (*p) && *p != '\0') + p++; + + if (*p == '\0') + return; + + idxs->first_idx = p - name->full; + name->first = e_name_western_get_words_at_idx ( + name->full, idxs->first_idx, 1); + + } else { + + /* + * Otherwise, the first name is probably the first string. + */ + idxs->first_idx = 0; + name->first = e_name_western_get_words_at_idx ( + name->full, idxs->first_idx, 1); + } + + /* + * Check that we didn't just assign the beginning of a + * compound last name to the first name. + */ + if (name->first != NULL) { + if (e_name_western_is_complex_last_beginning (name->first)) { + g_free (name->first); + name->first = NULL; + idxs->first_idx = -1; + } + } +} + +static void +e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs) +{ + char *word; + int middle_idx; + + /* + * Middle names can only exist if you have a first name. + */ + if (idxs->first_idx == -1) + return; + + middle_idx = idxs->first_idx + strlen (name->first) + 1; + + while (isspace (name->full [middle_idx]) && + name->full [middle_idx] != '\0') + middle_idx ++; + + if (middle_idx > strlen (name->full)) + return; + + if (name->full [middle_idx] == '\0') + return; + + /* + * Skip past the nickname, if it's there. + */ + if (name->full [middle_idx] == '\"') { + if (idxs->nick_idx == -1) + return; + + middle_idx = idxs->nick_idx + strlen (name->nick) + 1; + + while (isspace (name->full [middle_idx]) && + name->full [middle_idx] != '\0') + middle_idx ++; + + if (name->full [middle_idx] == '\0') + return; + } + + /* + * Make sure this isn't the beginning of a complex last name. + */ + word = e_name_western_get_words_at_idx (name->full, middle_idx, 1); + if (e_name_western_is_complex_last_beginning (word)) { + g_free (word); + return; + } + + /* + * Make sure this isn't a suffix. + */ + e_name_western_cleanup_string (& word); + if (e_name_western_word_is_suffix (word)) { + g_free (word); + return; + } + + /* + * Make sure we didn't just grab a cute nickname. + */ + if (word [0] == '\"') { + g_free (word); + return; + } + + idxs->middle_idx = middle_idx; + name->middle = word; +} + +static void +e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs) +{ + int idx; + int start_idx; + char *str; + + if (idxs->first_idx == -1) + return; + + if (idxs->middle_idx > idxs->first_idx) + idx = idxs->middle_idx + strlen (name->middle); + else + idx = idxs->first_idx + strlen (name->first); + + while (name->full [idx] != '\"' && name->full [idx] != '\0') + idx ++; + + if (name->full [idx] != '\"') + return; + + start_idx = idx; + + /* + * Advance to the next double quote. + */ + idx ++; + + while (name->full [idx] != '\"' && name->full [idx] != '\0') + idx ++; + + if (name->full [idx] == '\0') + return; + + str = g_malloc0 (idx - start_idx + 2); + strncpy (str, name->full + start_idx, idx - start_idx + 1); + + name->nick = str; + idxs->nick_idx = start_idx; +} + +static int +e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs) +{ + int max_idx = -1; + + if (name->prefix != NULL) + max_idx = e_name_western_max ( + max_idx, idxs->prefix_idx + strlen (name->prefix)); + + if (name->first != NULL) + max_idx = e_name_western_max ( + max_idx, idxs->first_idx + strlen (name->first)); + + if (name->middle != NULL) + max_idx = e_name_western_max ( + max_idx, idxs->middle_idx + strlen (name->middle)); + + if (name->nick != NULL) + max_idx = e_name_western_max ( + max_idx, idxs->nick_idx + strlen (name->nick)); + + return max_idx; +} + +static void +e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs) +{ + char *word; + int idx = -1; + + idx = e_name_western_last_get_max_idx (name, idxs); + + /* + * In the case where there is no preceding name element, the + * name is either just a first name ("Nat", "John"), is a + * single-element name ("Cher", which we treat as a first + * name), or is just a last name. The only time we can + * differentiate a last name alone from a single-element name + * or a first name alone is if it's a complex last name ("de + * Icaza", "van Josephsen"). So if there is no preceding name + * element, we check to see whether or not the first part of + * the name is the beginning of a complex name. If it is, + * we subsume the entire string. If we accidentally subsume + * the suffix, this will get fixed in the fixup routine. + */ + if (idx == -1) { + word = e_name_western_get_words_at_idx (name->full, 0, 1); + if (! e_name_western_is_complex_last_beginning (word)) { + g_free (word); + return; + } + + name->last = g_strdup (name->full); + idxs->last_idx = 0; + return; + } + + /* Skip past the white space. */ + while (isspace (name->full [idx]) && name->full [idx] != '\0') + idx ++; + + if (name->full [idx] == '\0') + return; + + word = e_name_western_get_words_at_idx (name->full, idx, 1); + e_name_western_cleanup_string (& word); + if (e_name_western_word_is_suffix (word)) { + g_free (word); + return; + } + g_free (word); + + /* + * Subsume the rest of the string into the last name. If we + * accidentally include the prefix, it will get fixed later. + * This is the only way to handle things like "Miguel de Icaza + * Amozorrutia" without dropping data and forcing the user + * to retype it. + */ + name->last = g_strdup (name->full + idx); + idxs->last_idx = idx; +} + +static char * +e_name_western_get_preceding_word (char *str, int idx) +{ + int word_len; + char *word; + char *p; + + p = str + idx; + + while (isspace (*p) && p > str) + p --; + + while (! isspace (*p) && p > str) + p --; + + if (isspace (*p)) + p ++; + + word_len = (str + idx) - p; + word = g_malloc0 (word_len + 1); + if (word_len > 0) + strncpy (word, p, word_len); + + return word; +} + +static char * +e_name_western_get_suffix_at_str_end (char *str) +{ + char *suffix; + char *p; + + /* + * Walk backwards till we reach the beginning of the + * (potentially-comma-separated) list of suffixes. + */ + p = str + strlen (str); + while (1) { + char *nextp; + char *word; + + word = e_name_western_get_preceding_word (str, p - str); + nextp = p - strlen (word) - 1; + + e_name_western_cleanup_string (& word); + + if (e_name_western_word_is_suffix (word)) { + p = nextp; + g_free (word); + } else { + g_free (word); + break; + } + } + + if (p == (str + strlen (str))) + return NULL; + + suffix = g_strdup (p); + e_name_western_cleanup_string (& suffix); + + if (strlen (suffix) == 0) { + g_free (suffix); + return NULL; + } + + return suffix; +} + +static void +e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs) +{ + + name->suffix = e_name_western_get_suffix_at_str_end (name->full); + + if (name->suffix == NULL) + return; + + idxs->suffix_idx = strlen (name->full) - strlen (name->suffix); +} + +static gboolean +e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs) +{ + char *comma; + char *word; + + comma = strchr (name->full, ','); + + if (comma == NULL) + return FALSE; + + /* + * If there's a comma, we need to detect whether it's + * separating the last name from the first or just separating + * suffixes. So we grab the word which comes before the + * comma and check if it's a suffix. + */ + word = e_name_western_get_preceding_word (name->full, comma - name->full); + + if (e_name_western_word_is_suffix (word)) { + g_free (word); + return FALSE; + } + + g_free (word); + return TRUE; +} + +static void +e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs) +{ + char *prefix; + char *last; + char *suffix; + char *firstmidnick; + char *newfull; + + char *comma; + char *p; + + if (! e_name_western_detect_backwards (name, idxs)) + return; + + /* + * Convert + * , + * to + * + */ + + /* + * Grab the prefix from the beginning. + */ + prefix = e_name_western_get_prefix_at_str (name->full); + + /* + * Everything from the end of the prefix to the comma is the + * last name. + */ + comma = strchr (name->full, ','); + if (comma == NULL) + return; + + p = name->full + (prefix == NULL ? 0 : strlen (prefix)); + + while (isspace (*p) && *p != '\0') + p ++; + + last = g_malloc0 (comma - p + 1); + strncpy (last, p, comma - p); + + /* + * Get the suffix off the end. + */ + suffix = e_name_western_get_suffix_at_str_end (name->full); + + /* + * Firstmidnick is everything from the comma to the beginning + * of the suffix. + */ + p = comma + 1; + + while (isspace (*p) && *p != '\0') + p ++; + + if (suffix != NULL) { + char *q; + + /* + * Point q at the beginning of the suffix. + */ + q = name->full + strlen (name->full) - strlen (suffix) - 1; + + /* + * Walk backwards until we hit the space which + * separates the suffix from firstmidnick. + */ + while (! isspace (*q) && q > comma) + q --; + + if ((q - p + 1) > 0) { + firstmidnick = g_malloc0 (q - p + 1); + strncpy (firstmidnick, p, q - p); + } else + firstmidnick = NULL; + } else { + firstmidnick = g_strdup (p); + } + + /* + * Create our new reordered version of the name. + */ +#define NULLSTR(a) ((a) == NULL ? "" : (a)) + newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick), + NULLSTR (last), NULLSTR (suffix)); + g_strstrip (newfull); + g_free (name->full); + name->full = newfull; + + + g_free (prefix); + g_free (firstmidnick); + g_free (last); + g_free (suffix); +} + +static void +e_name_western_zap_nil (char **str, int *idx) +{ + if (*str == NULL) + return; + + if (strlen (*str) != 0) + return; + + *idx = -1; + g_free (*str); + *str = NULL; +} + +static void +e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs) +{ + /* + * The middle and last names cannot be the same. + */ + if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) { + idxs->middle_idx = -1; + g_free (name->middle); + name->middle = NULL; + } + + /* + * If we have a middle name and no last name, then we mistook + * the last name for the middle name. + */ + if (idxs->last_idx == -1 && idxs->middle_idx != -1) { + idxs->last_idx = idxs->middle_idx; + name->last = name->middle; + name->middle = NULL; + idxs->middle_idx = -1; + } + + /* + * Check to see if we accidentally included the suffix in the + * last name. + */ + if (idxs->suffix_idx != -1 && idxs->last_idx != -1 && + idxs->suffix_idx < (idxs->last_idx + strlen (name->last))) { + char *sfx; + + sfx = name->last + (idxs->suffix_idx - idxs->last_idx); + if (sfx != NULL) { + char *newlast; + char *p; + + p = sfx - 1; + while (isspace (*p) && p > name->last) + p --; + p ++; + + newlast = g_malloc0 (p - name->last + 1); + strncpy (newlast, name->last, p - name->last); + g_free (name->last); + name->last = newlast; + } + } + + /* + * If we have a prefix and a first name, but no last name, + * then we need to assign the first name to the last name. + * This way we get things like "Mr Friedman" correctly. + */ + if (idxs->first_idx != -1 && idxs->prefix_idx != -1 && + idxs->last_idx == -1) { + name->last = name->first; + idxs->last_idx = idxs->first_idx; + idxs->first_idx = -1; + name->first = NULL; + } + + /* + * Remove stray spaces and commas (although there don't seem + * to be any in the test cases, they might show up later). + */ + e_name_western_cleanup_string (& name->prefix); + e_name_western_cleanup_string (& name->first); + e_name_western_cleanup_string (& name->middle); + e_name_western_cleanup_string (& name->nick); + e_name_western_cleanup_string (& name->last); + e_name_western_cleanup_string (& name->suffix); + + /* + * Make zero-length strings just NULL. + */ + e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx); + e_name_western_zap_nil (& name->first, & idxs->first_idx); + e_name_western_zap_nil (& name->middle, & idxs->middle_idx); + e_name_western_zap_nil (& name->nick, & idxs->nick_idx); + e_name_western_zap_nil (& name->last, & idxs->last_idx); + e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx); +} + +/** + * e_name_western_western_parse_fullname: + * @full_name: A string containing a Western name. + * + * Parses @full_name and returns an #ENameWestern object filled with + * the component parts of the name. + */ +ENameWestern * +e_name_western_parse (const char *full_name) +{ + ENameWesternIdxs *idxs; + ENameWestern *wname; + + wname = g_new0 (ENameWestern, 1); + + wname->full = g_strdup (full_name); + + idxs = g_new0 (ENameWesternIdxs, 1); + + idxs->prefix_idx = -1; + idxs->first_idx = -1; + idxs->middle_idx = -1; + idxs->nick_idx = -1; + idxs->last_idx = -1; + idxs->suffix_idx = -1; + + /* + * An extremely simple algorithm. + * + * The goal here is to get it right 95% of the time for + * Western names. + * + * First we check to see if this is an ass-backwards name + * ("Prefix Last, First Middle Suffix"). These names really + * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so + * we reorder them first and then parse them. + * + * Next, we grab the most obvious assignments for the various + * parts of the name. Once this is done, we check for stupid + * errors and fix them up. + */ + e_name_western_reorder_asshole (wname, idxs); + + e_name_western_extract_prefix (wname, idxs); + e_name_western_extract_first (wname, idxs); + e_name_western_extract_nickname (wname, idxs); + e_name_western_extract_middle (wname, idxs); + e_name_western_extract_last (wname, idxs); + e_name_western_extract_suffix (wname, idxs); + + e_name_western_fixup (wname, idxs); + + g_free (idxs); + + return wname; +} + +/** + * e_name_western_free: + * @name: An ENameWestern object which needs to be freed. + * + * Deep-frees @name + */ +void +e_name_western_free (ENameWestern *w) +{ + + g_free (w->prefix); + g_free (w->first); + g_free (w->middle); + g_free (w->nick); + g_free (w->last); + g_free (w->suffix); + + g_free (w); +} -- cgit v1.2.3