aboutsummaryrefslogblamecommitdiffstats
path: root/e-util/ename/e-name-western.c
blob: 698cceec0b55ceb0d63f77b1e09d1f4dde194a2e (plain) (tree)
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863



















































                                                               
                                                                   


                                         
                                                 
                              









                                                                                   




































































































































































































































                                                                                          




                                                             


                                                   





































































































































































































































































































































































































































































































































































                                                                                           
                         

                   
/*
 * A simple Western name parser.
 *
 * <Nat> Jamie, do you know anything about name parsing?
 * <jwz> Are you going down that rat hole?  Bring a flashlight.
 *
 * Author:
 *   Nat Friedman (nat@helixcode.com)
 *
 * Copyright 1999, Helix Code, Inc.
 */

#include <ctype.h>
#include <string.h>
#include <glib.h>
 
#include <ename/e-name-western.h>
#include <ename/e-name-western-tables.h>

typedef struct {
    int prefix_idx;
    int first_idx;
    int middle_idx;
    int nick_idx;
    int last_idx;
    int suffix_idx;
} ENameWesternIdxs;

/*
 * Count the space-separated words in @str.
 *
 * The result is one plus the number of ' ' characters found after the
 * first character, so every space in a run of spaces is counted.
 *
 * Returns 0 for a NULL or empty string.  (The empty string used to make
 * the scan step past the terminating NUL before calling strchr().)
 */
static int
e_name_western_str_count_words (char *str)
{
    int word_count;
    char *p;

    if (str == NULL || *str == '\0')
        return 0;

    word_count = 1;

    /* The first character never starts a new word, so begin at str + 1. */
    for (p = strchr (str + 1, ' '); p != NULL; p = strchr (p + 1, ' '))
        word_count ++;

    return word_count;
}

/*
 * Strip leading and trailing whitespace and commas from *str.
 *
 * *str is replaced by a newly allocated, trimmed copy and the old
 * string is freed.  A NULL *str is left untouched.
 */
static void
e_name_western_cleanup_string (char **str)
{
    char   *newstr;
    char   *p;
    size_t  len;

    if (*str == NULL)
        return;

    /*
     * Skip any spaces and commas at the start of the string.  The
     * cast keeps bytes >= 0x80 from reaching isspace() as negative
     * values, which is undefined behaviour.
     */
    p = *str;
    while (isspace ((unsigned char) *p) || *p == ',')
        p ++;

    /* Make the copy we're going to return. */
    newstr = g_strdup (p);

    /* Drop trailing spaces and commas by shortening the copy. */
    len = strlen (newstr);
    while (len > 0 &&
           (isspace ((unsigned char) newstr [len - 1]) ||
            newstr [len - 1] == ','))
        len --;
    newstr [len] = '\0';

    g_free (*str);
    *str = newstr;
}

/*
 * Return a newly allocated copy of up to @num_words whitespace-separated
 * words of @str, starting at byte offset @idx.
 *
 * Only one separator character is trimmed from the end of the copy, so
 * the result can still carry trailing whitespace (for example when the
 * words are followed by several spaces, or by whitespace that ends the
 * string).  An empty string is returned when there is nothing to copy,
 * including when @num_words is not positive.  The caller frees the
 * result with g_free().
 */
static char *
e_name_western_get_words_at_idx (char *str, int idx, int num_words)
{
    char *words;
    char *p;
    int   word_count;
    int   words_len;

    /*
     * Walk to the end of the words.  isspace() is given unsigned
     * char values; passing a negative char is undefined.
     */
    word_count = 0;
    p = str + idx;
    while (word_count < num_words && *p != '\0') {
        while (*p != '\0' && ! isspace ((unsigned char) *p))
            p ++;

        while (isspace ((unsigned char) *p))
            p ++;

        word_count ++;
    }

    /* p is now just past the separator, or at the terminating NUL. */
    words_len = p - str - idx - 1;

    if (*p == '\0')
        words_len ++;

    /*
     * If no word was walked, words_len is still -1 here; without the
     * clamp g_malloc0 (0) would return NULL and the copy below would
     * be asked to write a huge length through it.
     */
    if (words_len < 0)
        words_len = 0;

    words = g_malloc0 (1 + words_len);
    memcpy (words, str + idx, words_len);

    return words;
}

/*
 * Return the larger of two ints.  A plain function is used here
 * in place of glib's MAX macro.
 */
static int
e_name_western_max (const int a, const int b)
{
    return (a > b) ? a : b;
}

/*
 * Return TRUE when @word matches, ignoring case, an entry of the
 * NULL-terminated e_name_western_sfx_table; FALSE otherwise.
 */
static gboolean
e_name_western_word_is_suffix (char *word)
{
    int n = 0;

    while (e_name_western_sfx_table [n] != NULL) {
        if (g_strcasecmp (word, e_name_western_sfx_table [n]) == 0)
            return TRUE;

        n ++;
    }

    return FALSE;
}

/*
 * Return a newly allocated copy of the single prefix that starts @str,
 * or NULL when @str does not start with one.
 *
 * A prefix is either an entry of e_name_western_pfx_table (compared
 * ignoring case, over as many words as the entry has) or an unknown
 * abbreviation: a first word of more than two characters that starts
 * with two letters and ends in a period.
 */
static char *
e_name_western_get_one_prefix_at_str (char *str)
{
    char   *word;
    size_t  len;
    int     i;

    /*
     * Check for prefixes from our table.
     */
    for (i = 0; e_name_western_pfx_table [i] != NULL; i ++) {
        int pfx_words;
        char *words;

        pfx_words = e_name_western_str_count_words (e_name_western_pfx_table [i]);
        words = e_name_western_get_words_at_idx (str, 0, pfx_words);

        if (! g_strcasecmp (words, e_name_western_pfx_table [i]))
            return words;

        g_free (words);
    }

    /*
     * Check for prefixes we don't know about.  These are always a
     * sequence of more than one letter followed by a period.
     * isalpha() is given unsigned char values; passing a negative
     * char is undefined.
     */
    word = e_name_western_get_words_at_idx (str, 0, 1);
    len = strlen (word);

    if (len > 2 &&
        isalpha ((unsigned char) word [0]) &&
        isalpha ((unsigned char) word [1]) &&
        word [len - 1] == '.')
        return word;

    g_free (word);

    return NULL;
}

/*
 * Return a newly allocated copy of the prefix at the start of @str, or
 * NULL if there is none.  Up to two consecutive prefixes are recognised;
 * when a second one follows the first, the result spans from the start
 * of @str to the end of the second prefix, separator included.
 */
static char *
e_name_western_get_prefix_at_str (char *str)
{
    char *pfx;
    char *pfx1;
    char *pfx2;
    char *p;
    int   pfx_len;

    /* Get the first prefix. */
    pfx1 = e_name_western_get_one_prefix_at_str (str);

    if (pfx1 == NULL)
        return NULL;

    /*
     * Check for a second prefix.  isspace() is given unsigned char
     * values; passing a negative char is undefined.
     */
    p = str + strlen (pfx1);
    while (isspace ((unsigned char) *p))
        p ++;

    pfx2 = e_name_western_get_one_prefix_at_str (p);

    /* Only one prefix: hand the first one back as it is. */
    if (pfx2 == NULL)
        return pfx1;

    /* pfx2 is a copy of the text at p, so this range lies inside str. */
    pfx_len = (p + strlen (pfx2)) - str;
    pfx = g_malloc0 (pfx_len + 1);
    memcpy (pfx, str, pfx_len);

    g_free (pfx1);
    g_free (pfx2);

    return pfx;
}

/*
 * Store the prefix found at the start of name->full, if any, in
 * name->prefix and record its offset (always 0) in idxs->prefix_idx.
 * Both are left untouched when no prefix is found.
 */
static void
e_name_western_extract_prefix (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *found = e_name_western_get_prefix_at_str (name->full);

    if (found != NULL) {
        name->prefix     = found;
        idxs->prefix_idx = 0;
    }
}

/*
 * Return TRUE when @word matches, ignoring case, an entry of the
 * NULL-terminated e_name_western_complex_last_table; FALSE otherwise.
 */
static gboolean
e_name_western_is_complex_last_beginning (char *word)
{
    int n = 0;

    while (e_name_western_complex_last_table [n] != NULL) {
        if (g_strcasecmp (word, e_name_western_complex_last_table [n]) == 0)
            return TRUE;

        n ++;
    }

    return FALSE;
}

/*
 * Take the first name: the word right after the prefix, or the first
 * word of name->full when there is no prefix.
 *
 * Nothing is stored when only whitespace follows the prefix.  If the
 * chosen word is the beginning of a complex last name, it is discarded
 * again and idxs->first_idx is reset to -1.
 */
static void
e_name_western_extract_first (ENameWestern *name, ENameWesternIdxs *idxs)
{
    int start = 0;

    /*
     * If there's a prefix, then the first name is right after it.
     * Otherwise, the first name is probably the first string.
     */
    if (idxs->prefix_idx != -1) {
        char *p;

        p = name->full + idxs->prefix_idx + strlen (name->prefix);

        /*
         * Skip past white space.  isspace() is given unsigned char
         * values; passing a negative char is undefined.
         */
        while (isspace ((unsigned char) *p))
            p ++;

        if (*p == '\0')
            return;

        start = p - name->full;
    }

    idxs->first_idx = start;
    name->first = e_name_western_get_words_at_idx (name->full, start, 1);

    /*
     * Check that we didn't just assign the beginning of a
     * compound last name to the first name.
     */
    if (name->first != NULL &&
        e_name_western_is_complex_last_beginning (name->first)) {
        g_free (name->first);
        name->first = NULL;
        idxs->first_idx = -1;
    }
}

/*
 * Locate the middle name: the word following the first name, or
 * following the quoted nickname when the nickname comes right after
 * the first name.
 *
 * Nothing is stored when there is no first name, when the string ends
 * before a candidate word, or when the candidate begins a complex last
 * name, is a suffix, or starts with a double quote.  On success
 * name->middle owns the cleaned-up word and idxs->middle_idx holds the
 * offset of that word in name->full.
 */
static void
e_name_western_extract_middle (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *word;
    int   middle_idx;

    /*
     * Middle names can only exist if you have a first name.
     */
    if (idxs->first_idx == -1)
        return;

    middle_idx = idxs->first_idx + strlen (name->first) + 1;

    if ((size_t) middle_idx > strlen (name->full))
        return;

    /*
     * Skip any white space in front of the candidate word.  isspace()
     * is given unsigned char values; passing a negative char is
     * undefined.
     */
    while (isspace ((unsigned char) name->full [middle_idx]))
        middle_idx ++;

    if (name->full [middle_idx] == '\0')
        return;

    /*
     * Skip past the nickname, if it's there.
     */
    if (name->full [middle_idx] == '\"') {
        if (idxs->nick_idx == -1)
            return;

        /*
         * Index of the character right after the closing quote.
         * Stop here if the nickname ends the string: stepping over
         * the separator first would read past the terminating NUL.
         */
        middle_idx = idxs->nick_idx + strlen (name->nick);
        if (name->full [middle_idx] == '\0')
            return;

        middle_idx ++;

        while (isspace ((unsigned char) name->full [middle_idx]))
            middle_idx ++;

        if (name->full [middle_idx] == '\0')
            return;
    }

    /*
     * Make sure this isn't the beginning of a complex last name.
     */
    word = e_name_western_get_words_at_idx (name->full, middle_idx, 1);
    if (e_name_western_is_complex_last_beginning (word)) {
        g_free (word);
        return;
    }

    /*
     * Make sure this isn't a suffix.
     */
    e_name_western_cleanup_string (& word);
    if (e_name_western_word_is_suffix (word)) {
        g_free (word);
        return;
    }

    /*
     * Make sure we didn't just grab a quoted nickname.
     */
    if (word [0] == '\"') {
        g_free (word);
        return;
    }

    idxs->middle_idx = middle_idx;
    name->middle = word;
}

/*
 * Find a double-quoted nickname located after the first name (or after
 * the middle name, when one has already been recorded beyond the first
 * name) and store it, quotes included, in name->nick.
 *
 * idxs->nick_idx receives the offset of the opening quote.  Nothing is
 * stored when there is no first name, no opening quote, or no closing
 * quote.
 */
static void
e_name_western_extract_nickname (ENameWestern *name, ENameWesternIdxs *idxs)
{
    const char *full;
    char       *quoted;
    int         open_idx;
    int         close_idx;
    int         quoted_len;

    if (idxs->first_idx == -1)
        return;

    /* Begin scanning right after the last component found so far. */
    if (idxs->middle_idx > idxs->first_idx)
        open_idx = idxs->middle_idx + strlen (name->middle);
    else
        open_idx = idxs->first_idx + strlen (name->first);

    full = name->full;

    /* Find the opening double quote. */
    for (; full [open_idx] != '\0'; open_idx ++) {
        if (full [open_idx] == '\"')
            break;
    }

    if (full [open_idx] != '\"')
        return;

    /* Find the closing double quote. */
    for (close_idx = open_idx + 1; full [close_idx] != '\0'; close_idx ++) {
        if (full [close_idx] == '\"')
            break;
    }

    if (full [close_idx] == '\0')
        return;

    /* Copy the nickname together with both quotes. */
    quoted_len = close_idx - open_idx + 1;
    quoted = g_malloc0 (quoted_len + 1);
    strncpy (quoted, full + open_idx, quoted_len);

    name->nick = quoted;
    idxs->nick_idx = open_idx;
}

/*
 * Return the largest end offset (start offset plus length) among the
 * prefix, first, middle and nick components that are present, or -1
 * when none of them is set.
 */
static int
e_name_western_last_get_max_idx (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *parts [4];
    int   starts [4];
    int   furthest = -1;
    int   i;

    parts [0] = name->prefix;  starts [0] = idxs->prefix_idx;
    parts [1] = name->first;   starts [1] = idxs->first_idx;
    parts [2] = name->middle;  starts [2] = idxs->middle_idx;
    parts [3] = name->nick;    starts [3] = idxs->nick_idx;

    for (i = 0; i < 4; i ++) {
        if (parts [i] == NULL)
            continue;

        furthest = e_name_western_max (
            furthest, starts [i] + strlen (parts [i]));
    }

    return furthest;
}

/*
 * Determine the last name: everything in name->full after the
 * prefix/first/middle/nick components found so far.
 *
 * Nothing is stored when the remainder is empty or starts with a
 * suffix.  When no earlier component exists, the whole string becomes
 * the last name, but only if it starts with a complex-last-name word.
 */
static void
e_name_western_extract_last (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *word;
    int   idx = -1;

    idx = e_name_western_last_get_max_idx (name, idxs);

    /*
     * In the case where there is no preceding name element, the
     * name is either just a first name ("Nat", "John"), is a
     * single-element name ("Cher", which we treat as a first
     * name), or is just a last name.  The only time we can
     * differentiate a last name alone from a single-element name
     * or a first name alone is if it's a complex last name ("de
     * Icaza", "van Josephsen").  So if there is no preceding name
     * element, we check to see whether or not the first part of
     * the name is the beginning of a complex name.  If it is,
     * we subsume the entire string.  If we accidentally subsume
     * the suffix, this will get fixed in the fixup routine.
     */
    if (idx == -1) {
        gboolean complex_last;

        word = e_name_western_get_words_at_idx (name->full, 0, 1);
        complex_last = e_name_western_is_complex_last_beginning (word);

        /* Free the probe word on both paths; it used to leak on success. */
        g_free (word);

        if (! complex_last)
            return;

        name->last     = g_strdup (name->full);
        idxs->last_idx = 0;
        return;
    }

    /*
     * Skip past the white space.  isspace() is given unsigned char
     * values; passing a negative char is undefined.
     */
    while (isspace ((unsigned char) name->full [idx]))
        idx ++;

    if (name->full [idx] == '\0')
        return;

    word = e_name_western_get_words_at_idx (name->full, idx, 1);
    e_name_western_cleanup_string (& word);
    if (e_name_western_word_is_suffix (word)) {
        g_free (word);
        return;
    }
    g_free (word);

    /*
     * Subsume the rest of the string into the last name.  If we
     * accidentally include the suffix, it will get fixed later.
     * This is the only way to handle things like "Miguel de Icaza
     * Amozorrutia" without dropping data and forcing the user
     * to retype it.
     */
    name->last = g_strdup (name->full + idx);
    idxs->last_idx = idx;
}

/*
 * Return a newly allocated copy of the text of @str that ends at byte
 * offset @idx (exclusive) and begins at the start of the word found by
 * scanning backwards from @idx.
 *
 * If the character at @idx is white space, the white space in front of
 * it is skipped first, so the copy then ends with that white space.
 * The result is never NULL; it is the empty string when there is no
 * preceding text or when @idx is not positive.  The caller frees it
 * with g_free().
 */
static char *
e_name_western_get_preceding_word (char *str, int idx)
{
    int   word_len;
    char *word;
    char *p;

    /* Nothing precedes the start of the string. */
    if (idx <= 0)
        return g_malloc0 (1);

    p = str + idx;

    /*
     * isspace() is given unsigned char values; passing a negative
     * char is undefined.
     */
    while (p > str && isspace ((unsigned char) *p))
        p --;

    while (p > str && ! isspace ((unsigned char) *p))
        p --;

    /* Step off the separator that stopped the scan. */
    if (isspace ((unsigned char) *p))
        p ++;

    word_len = (str + idx) - p;
    if (word_len < 0)
        word_len = 0;

    word = g_malloc0 (word_len + 1);
    if (word_len > 0)
        memcpy (word, p, word_len);

    return word;
}

static char *
e_name_western_get_suffix_at_str_end (char *str)
{
    char *suffix;
    char *p;

    /*
     * Walk backwards till we reach the beginning of the
     * (potentially-comma-separated) list of suffixes.
     */
    p = str + strlen (str);
    while (1) {
        char *nextp;
        char *word;

        word = e_name_western_get_preceding_word (str, p - str);
        nextp = p - strlen (word) - 1;
        
        e_name_western_cleanup_string (& word);

        if (e_name_western_word_is_suffix (word)) {
            p = nextp;
            g_free (word);
        } else {
            g_free (word);
            break;
        }
    }

    if (p == (str + strlen (str)))
        return NULL;

    suffix = g_strdup (p);
    e_name_western_cleanup_string (& suffix);

    if (strlen (suffix) == 0) {
        g_free (suffix);
        return NULL;
    }

    return suffix;
}

/*
 * Store the suffix list found at the end of name->full in name->suffix
 * (NULL when there is none).  When a suffix exists, idxs->suffix_idx is
 * set to the length of name->full minus the length of the suffix.
 */
static void
e_name_western_extract_suffix (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *sfx = e_name_western_get_suffix_at_str_end (name->full);

    name->suffix = sfx;

    if (sfx != NULL)
        idxs->suffix_idx = strlen (name->full) - strlen (sfx);
}

/*
 * Decide whether name->full is written "Last, First ...".
 *
 * Returns FALSE when the string has no comma, or when the word in front
 * of the first comma is a suffix (the comma then only separates
 * suffixes).  Returns TRUE otherwise.  @idxs is not used.
 */
static gboolean
e_name_western_detect_backwards (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char     *first_comma;
    char     *before;
    gboolean  before_is_suffix;

    first_comma = strchr (name->full, ',');
    if (first_comma == NULL)
        return FALSE;

    /* Look at the word that comes right before the comma. */
    before = e_name_western_get_preceding_word (
        name->full, first_comma - name->full);
    before_is_suffix = e_name_western_word_is_suffix (before);
    g_free (before);

    return before_is_suffix ? FALSE : TRUE;
}

/*
 * Rewrite a name given as "Prefix Last, First Middle Suffix" into
 * "Prefix First Middle Last Suffix".
 *
 * name->full is replaced by the reordered string when
 * e_name_western_detect_backwards() reports a reversed name; otherwise
 * it is left untouched.  @idxs is only passed on to the detection
 * routine.
 */
static void
e_name_western_reorder_asshole (ENameWestern *name, ENameWesternIdxs *idxs)
{
    char *prefix;
    char *last;
    char *suffix;
    char *firstmidnick;
    char *newfull;

    char *comma;
    char *p;

    if (! e_name_western_detect_backwards (name, idxs))
        return;

    /*
     * Convert
     *    <Prefix> <Last name>, <First name> <Middle[+nick] name> <Suffix>
     * to
     *    <Prefix> <First name> <Middle[+nick] name> <Last name> <Suffix>
     */

    /*
     * Locate the comma before allocating anything, so that the
     * early return cannot leak the prefix.
     */
    comma = strchr (name->full, ',');
    if (comma == NULL)
        return;

    /*
     * Grab the prefix from the beginning.
     */
    prefix = e_name_western_get_prefix_at_str (name->full);

    /*
     * Everything from the end of the prefix to the comma is the
     * last name.  isspace() is given unsigned char values; passing
     * a negative char is undefined.
     */
    p = name->full + (prefix == NULL ? 0 : strlen (prefix));

    while (isspace ((unsigned char) *p))
        p ++;

    /*
     * If the detected prefix runs up to or past the comma, there is
     * no last-name text to copy; leaving last NULL avoids a negative
     * allocation size.
     */
    last = NULL;
    if (p < comma) {
        last = g_malloc0 (comma - p + 1);
        memcpy (last, p, comma - p);
    }

    /*
     * Get the suffix off the end.
     */
    suffix = e_name_western_get_suffix_at_str_end (name->full);

    /*
     * Firstmidnick is everything from the comma to the beginning
     * of the suffix.
     */
    p = comma + 1;

    while (isspace ((unsigned char) *p))
        p ++;

    if (suffix != NULL) {
        char *q;

        /*
         * Point q just before the beginning of the suffix.
         */
        q = name->full + strlen (name->full) - strlen (suffix) - 1;

        /*
         * Walk backwards until we hit the space which
         * separates the suffix from firstmidnick.
         */
        while (! isspace ((unsigned char) *q) && q > comma)
            q --;

        if ((q - p + 1) > 0) {
            firstmidnick = g_malloc0 (q - p + 1);
            memcpy (firstmidnick, p, q - p);
        } else
            firstmidnick = NULL;
    } else {
        firstmidnick = g_strdup (p);
    }

    /*
     * Create our new reordered version of the name.
     */
#define NULLSTR(a) ((a) == NULL ? "" : (a))
    newfull = g_strdup_printf ("%s %s %s %s", NULLSTR (prefix), NULLSTR (firstmidnick),
                   NULLSTR (last), NULLSTR (suffix));
    g_strstrip (newfull);
    g_free (name->full);
    name->full = newfull;


    g_free (prefix);
    g_free (firstmidnick);
    g_free (last);
    g_free (suffix);
}

/*
 * If *str is an empty string, free it, set *str to NULL and set *idx
 * to -1.  A NULL or non-empty *str is left alone.
 */
static void
e_name_western_zap_nil (char **str, int *idx)
{
    char *s = *str;

    if (s == NULL || s [0] != '\0')
        return;

    *idx = -1;
    g_free (s);
    *str = NULL;
}

/*
 * Repair the mistakes the extraction passes are known to make:
 *
 *   - a middle name that is really the last name,
 *   - a suffix that was swallowed into the last name,
 *   - a lone name after a prefix, which is a last name ("Mr Friedman").
 *
 * Afterwards every component is stripped of surrounding spaces and
 * commas, and components that end up empty are set to NULL with their
 * index reset to -1.
 */
static void
e_name_western_fixup (ENameWestern *name, ENameWesternIdxs *idxs)
{
    /*
     * The middle and last names cannot be the same.
     */
    if (idxs->middle_idx != -1 && idxs->middle_idx == idxs->last_idx) {
        idxs->middle_idx = -1;
        g_free (name->middle);
        name->middle = NULL;
    }

    /*
     * If we have a middle name and no last name, then we mistook
     * the last name for the middle name.
     */
    if (idxs->last_idx == -1 && idxs->middle_idx != -1) {
        idxs->last_idx   = idxs->middle_idx;
        name->last       = name->middle;
        name->middle     = NULL;
        idxs->middle_idx = -1;
    }

    /*
     * Check to see if we accidentally included the suffix in the
     * last name.
     */
    if (idxs->suffix_idx != -1 && idxs->last_idx != -1 &&
        idxs->suffix_idx < (idxs->last_idx + (int) strlen (name->last))) {
        char *newlast;
        int   sfx_off;
        int   keep;

        /* Offset of the suffix inside name->last. */
        sfx_off = idxs->suffix_idx - idxs->last_idx;
        keep = 0;

        /*
         * Only look in front of the suffix when there is something
         * in front of it; otherwise we would read before the start
         * of name->last.
         */
        if (sfx_off > 0) {
            char *p;

            /*
             * Back up over the white space which separates the
             * last name from the suffix.  isspace() is given
             * unsigned char values; passing a negative char is
             * undefined.
             */
            p = name->last + sfx_off - 1;
            while (isspace ((unsigned char) *p) && p > name->last)
                p --;
            p ++;

            keep = p - name->last;
        }

        newlast = g_malloc0 (keep + 1);
        memcpy (newlast, name->last, keep);
        g_free (name->last);
        name->last = newlast;
    }

    /*
     * If we have a prefix and a first name, but no last name,
     * then we need to assign the first name to the last name.
     * This way we get things like "Mr Friedman" correctly.
     */
    if (idxs->first_idx != -1 && idxs->prefix_idx != -1 &&
        idxs->last_idx == -1) {
        name->last      = name->first;
        idxs->last_idx  = idxs->first_idx;
        idxs->first_idx = -1;
        name->first     = NULL;
    }

    /*
     * Remove stray spaces and commas (although there don't seem
     * to be any in the test cases, they might show up later).
     */
    e_name_western_cleanup_string (& name->prefix);
    e_name_western_cleanup_string (& name->first);
    e_name_western_cleanup_string (& name->middle);
    e_name_western_cleanup_string (& name->nick);
    e_name_western_cleanup_string (& name->last);
    e_name_western_cleanup_string (& name->suffix);

    /*
     * Make zero-length strings just NULL.
     */
    e_name_western_zap_nil (& name->prefix, & idxs->prefix_idx);
    e_name_western_zap_nil (& name->first,  & idxs->first_idx);
    e_name_western_zap_nil (& name->middle, & idxs->middle_idx);
    e_name_western_zap_nil (& name->nick,   & idxs->nick_idx);
    e_name_western_zap_nil (& name->last,   & idxs->last_idx);
    e_name_western_zap_nil (& name->suffix, & idxs->suffix_idx);
}

/**
 * e_name_western_western_parse_fullname:
 * @full_name: A string containing a Western name.
 *
 * Parses @full_name and returns an #ENameWestern object filled with
 * the component parts of the name.
 */
ENameWestern *
e_name_western_parse (const char *full_name)
{
    ENameWesternIdxs *idxs;
    ENameWestern *wname;

    wname = g_new0 (ENameWestern, 1);

    wname->full = g_strdup (full_name);

    idxs = g_new0 (ENameWesternIdxs, 1);

    idxs->prefix_idx = -1;
    idxs->first_idx  = -1;
    idxs->middle_idx = -1;
    idxs->nick_idx   = -1;
    idxs->last_idx   = -1;
    idxs->suffix_idx = -1;
    
    /*
     * An extremely simple algorithm.
     *
     * The goal here is to get it right 95% of the time for
     * Western names.
     *
     * First we check to see if this is an ass-backwards name
     * ("Prefix Last, First Middle Suffix").  These names really
     * suck (imagine "Dr von Johnson, Albert Roderick Jr"), so
     * we reorder them first and then parse them.
     *
     * Next, we grab the most obvious assignments for the various
     * parts of the name.  Once this is done, we check for stupid
     * errors and fix them up.
     */
    e_name_western_reorder_asshole  (wname, idxs);

    e_name_western_extract_prefix   (wname, idxs);
    e_name_western_extract_first    (wname, idxs);
    e_name_western_extract_nickname (wname, idxs);
    e_name_western_extract_middle   (wname, idxs);
    e_name_western_extract_last     (wname, idxs);
    e_name_western_extract_suffix   (wname, idxs);

    e_name_western_fixup            (wname, idxs);

    g_free (idxs);

    return wname;
}

/**
 * e_name_western_free:
 * @w: An #ENameWestern object which needs to be freed, or %NULL.
 *
 * Deep-frees @w: every component string, the full name and the
 * structure itself.  Passing %NULL does nothing, in line with
 * g_free().
 */
void
e_name_western_free (ENameWestern *w)
{
    if (w == NULL)
        return;

    g_free (w->prefix);
    g_free (w->first);
    g_free (w->middle);
    g_free (w->nick);
    g_free (w->last);
    g_free (w->suffix);

    g_free (w->full);

    g_free (w);
}