#!/bin/sh

# The archive directory must be given as the first argument.
case "$#" in
  0)
    echo "Must supply path to archive files"
    exit 1
    ;;
esac

# Two spellings of the same directory are kept:
#   native  - the path exactly as supplied (minus one trailing slash); used
#             by this script's own file operations (cd, >> redirection)
#   archive - the path passed to "rchive -archive"; rewritten by
#             "cygpath -w" when running under Cygwin/MinGW
archive="$1"
native=${archive%/}
shift

# Collapse "CYGWIN_NT-<version>" and "MINGW<nn>_NT-<version>" to "CYGWIN_NT".
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]; then
  archive=$(cygpath -w "$archive")
fi

# Drop a single trailing slash, if any, from the (possibly converted) path.
archive=${archive%/}

# deleteCitations FILE
#
# Processes the DeleteCitation entries of the XML file FILE:
#   1. xtract pulls the PMID elements out of every DeleteCitation pattern,
#      one per line; the sorted, de-duplicated list goes to .TO-REPORT.
#   2. That list is piped through "rchive -trie -gzip"; its sorted,
#      de-duplicated output goes to .TO-DELETE and is then handed to
#      "rm -f" as file names relative to "$native".
#      (presumably these are the archive-relative paths of the stored
#      records - confirm against the rchive documentation)
#   3. The PMID list is appended to "$native/deleted.uid".
# Both scratch files live in the current directory and are removed at the end.
#
# Globals: reads "native"; sets "inp", "pmidlist" and "delenda"
#          (plain sh has no function-local variables).
deleteCitations() {
  inp="$1"
  pmidlist=.TO-REPORT
  delenda=.TO-DELETE

  # NOTE(review): data is fed to xtract/rchive through a pipe on purpose;
  # confirm the tools accept a redirected file before replacing the cat.
  cat "$inp" \
    | xtract -pattern DeleteCitation -block PMID -tab "\n" -sep "." -element "PMID" \
    | sort -n \
    | uniq > "$pmidlist"

  cat "$pmidlist" \
    | rchive -trie -gzip \
    | sort -n \
    | uniq > "$delenda"

  # The redirection is opened before the subshell changes directory, so the
  # relative scratch-file name still resolves against the current directory.
  if test -s "$delenda"; then
    (cd "$native" && xargs rm -f) < "$delenda"
  fi

  if test -s "$pmidlist"; then
    cat "$pmidlist" >> "$native/deleted.uid"
  fi

  rm "$pmidlist"
  rm "$delenda"
}

# reportVersioned FILE
#
# Asks xtract for the PMID of every PubmedArticle in FILE whose
# MedlineCitation/PMID carries a Version attribute greater than 1, and
# appends the sorted, de-duplicated list to "$native/versioned.uid".
# Nothing is appended when the list is empty.  The scratch file .TO-REPORT
# is created in the current directory and removed before returning.
#
# Globals: reads "native"; sets "inp" and "pmidlist"
#          (plain sh has no function-local variables).
reportVersioned() {
  inp="$1"
  pmidlist=.TO-REPORT

  xtract -input "$inp" -pattern PubmedArticle \
    -block MedlineCitation/PMID -if "@Version" -gt 1 -element "PMID" \
    | sort -n \
    | uniq > "$pmidlist"

  if test -s "$pmidlist"; then
    cat "$pmidlist" >> "$native/versioned.uid"
  fi

  rm "$pmidlist"
}

# Remove any "versioned" data file and its sentinel from the current
# directory before scanning.
# NOTE(review): presumably leftovers from a separate workflow step that must
# not be picked up by the loop below - confirm.
rm -f "versioned.xml.gz"
rm -f "versioned.snt"

# Process every compressed XML file in the current directory that has not
# been handled yet.  A file NAME.xml.gz counts as handled once the sentinel
# NAME.snt exists; the sentinel is written at the end of each iteration.
# Note that the exit status of the individual steps is not checked, so the
# sentinel is written even when one of them fails.
for fl in *.xml.gz
do
  # When nothing matches, sh leaves the pattern itself in the list.  Skip it,
  # otherwise the loop would run once on the literal "*.xml.gz" and leave a
  # bogus "*.snt" sentinel behind.
  [ -f "$fl" ] || continue
  base=${fl%.xml.gz}
  if [ -f "$base.snt" ]
  then
    continue
  fi
  secnds_start=$(date "+%s")
  echo "$base.xml"
  # Decompress and reformat into a temporary NAME.xml wrapped in a single
  # <PubmedArticleSet> element.
  gunzip -c "$fl" |
  xtract -compress -strict -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" \
    -pattern "PubmedArticleSet/*" -format > "$base.xml"
  # Store each PubmedArticle in the archive, indexed by PMID and version.
  rchive -gzip -input "$base.xml" -archive "$archive" \
    -index MedlineCitation/PMID^Version -pattern PubmedArticle
  deleteCitations "$base.xml"
  reportVersioned "$base.xml"
  touch "$base.snt"
  rm "$base.xml"
  secnds_end=$(date "+%s")
  secnds=$((secnds_end - secnds_start))
  echo "$secnds seconds"
done
