#!/bin/bash
#
# Split an mbox into message files, and sort them into directories according
# to message-id splitting.
#
# This needs to run under bash because some of the ${} stuff is a bash-ism.
#
# $Id$

# Usage: <this script> MBOX
#
# MBOX may be a plain mbox or a gzip-compressed one (name ending in .gz).
# The destination tree is taken from SPLITMBOX_DEST, which is expected to be
# set by the configuration file sourced below.
#
# Every message is hard-linked into the destination tree once per Message-Id
# it carries.  Messages that could not be archived are left behind in the
# temporary directory and the script exits with a non-zero status.

# Print a diagnostic on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# The exit status of the configuration file is deliberately not checked: a
# missing SPLITMBOX_DEST is caught by the destination check below.
. "$HOME/etc/archives.conf"

mbox=$1
destdir=$SPLITMBOX_DEST
export destdir
# mktemp honours TMPDIR.  NOTE(review): presumably ~/tmp is used so that the
# split messages live on the same filesystem as the destination, since the
# hard links made below cannot cross filesystems -- confirm.
export TMPDIR=~/tmp

# *() and other patterns need this set
shopt -s extglob

# The expansions must be quoted: with an empty value an unquoted test
# degenerates to a two-argument form that never fails, which would let the
# script run without an mbox or without a destination.
if [ ! -f "$mbox" ]; then
  die "mbox $mbox not found"
fi

if [ ! -d "$destdir" ]; then
  die "destination directory $destdir does not exist"
fi

# Pick the reader according to the file name: *.gz is decompressed on the fly.
case $mbox in
  *.gz) cat=zcat ;;
  *)    cat=cat ;;
esac

# Without this check a failed mktemp leaves $tempdir empty and everything
# below would operate on /msg.*.
tempdir=$(mktemp -d) || die "failed to create a temporary directory in $TMPDIR"
# exported because the sh -c snippet run by formail needs it
export tempdir

# Split the mbox into one file per message; formail provides FILENO to the
# command it runs for each message.
"$cat" "$mbox" | formail -s sh -c 'cat > "$tempdir/msg.$FILENO"'
if [ "${PIPESTATUS[0]}" -ne 0 ]; then
  # A truncated or unreadable mbox would archive a partial message that a
  # later run could not replace (existing targets are never overwritten).
  rm -rf -- "${tempdir:?}"
  die "failed to read $mbox"
fi

for message in "$tempdir"/msg.*; do
  # the glob stays literal when the mbox contained no messages
  [ -e "$message" ] || continue

  placed=0   # set once the message is present under at least one Message-Id
  failed=0   # set when a directory or link could not be created

  # Fetch the Message-Id.  Note: due to braindamage at Microsoft we need to
  # cope with messages having more than one Message-Id, so we loop here.
  # The loop reads from a process substitution rather than a pipe so that
  # $placed and $failed are still visible after it ends.  -r keeps
  # backslashes in the id intact; the default IFS still trims surrounding
  # whitespace.
  while read -r messageid; do
    # strip the initial " <", the trailing ">", and replace /'s with _
    messageid=${messageid##*([< ])}
    messageid=${messageid%%>}
    messageid=${messageid//\//_}
    # an empty id (e.g. "<>") would produce an empty file name
    [ -n "$messageid" ] || continue

    # fetch the part after the first @ (inclusive), and split in two levels of
    # dirs of two chars.  So for @gmail.com we get gm/ai/@gmail.com
    domain=${messageid##*([^@])}
    dir=${domain:1:2}/${domain:3:2}/$domain

    # The file is named after the complete message-id.
    # NOTE(review): the previous code tried to strip "$dir" from the end of
    # the id here, but by that point $dir contained '/' separators while the
    # id no longer could, so the strip effectively never matched.  The
    # resulting layout is kept unchanged so that existing archive trees stay
    # valid -- confirm before changing it to the local part only.
    file=$messageid
    # two more levels of dirs of two chars, taken from the file name
    dir=$dir/${file:0:2}/${file:2:2}
    target=$destdir/$dir/$file

    # create the directory if needed
    if ! mkdir -p -- "$destdir/$dir"; then
      echo "failed to create $destdir/$dir" >&2
      failed=1
      continue
    fi

    # and link the message into place; an already existing target is kept
    if [ ! -f "$target" ]; then
      if ! ln -- "$message" "$target"; then
        echo "failed to link $message to $target" >&2
        failed=1
        continue
      fi
    fi
    placed=1
  done < <(formail -x Message-Id < "$message")

  # Only drop the temporary copy once the message is safely archived.
  if [ "$failed" -eq 0 ] && [ "$placed" -eq 1 ]; then
    rm -- "$message"
  elif [ "$failed" -eq 0 ]; then
    echo "no usable Message-Id in $message, leaving it in place" >&2
  else
    echo "could not archive $message, leaving it in place" >&2
  fi
done

# rmdir fails if any message was left behind above
rmdir -- "$tempdir" || die "unprocessed messages remain in $tempdir"