#!/bin/sh
# gbdummyfy ver. 5.6   Copyright (C) 2009-2015 Giulio Bottazzi

#default settings
pos=1
del="no"
#every flag is initialised explicitly: an unset shell variable would
#otherwise silently take its value from the environment of the caller
#(e.g. an exported help=yes)
help="no"
version="no"
verbose="no"

#read command line options; the position of the last option is saved
#in OPTIND. The "-:" entry of the option string makes getopts deliver
#a long option such as --help as option "-" with the option name
#stored in OPTARG.
while getopts "c:d:hv-:" opt
do
    case $opt in
	-)
	    case "${OPTARG}" in
		help) help=yes;;
		version) version=yes;;
		*)
		    #unknown long options are treated like the unknown
		    #short ones: complain and show the help
		    printf '%s\n' "gbdummyfy: unknown option --${OPTARG}" >&2
		    help=yes;;
	    esac;;
	c) pos=$OPTARG ;;
	d) del=$OPTARG ;;
	v) verbose=yes ;;
	h) help=yes ;;
	#getopts has already printed its own diagnostic
	\?) help=yes;;
    esac
done

#--version was requested: show the release and licence notice on
#standard output and stop without reading any data
case "$version" in
    yes)
	cat <<EOF
gbdummyfy ver. 5.6

Copyright (C) 2009-2015 Giulio Bottazzi

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
(version 2) as published by the Free Software Foundation.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Written by Giulio Bottazzi

Report bugs to <gbutils@googlegroups.com>

Package home page <http://cafim.sssup.it/~giulio/software/gbutils/index.html>
EOF
	exit
	;;
esac

#Print the help text on standard output.
#The here-document delimiter is quoted, so the text is emitted
#literally (backticks and backslashes need no escaping).
usage () {
    cat <<'EOF'
This command reads from standard input a text file with space
separated columns. The entries in one column (the first by default)
are considered labels and expanded into a matrix of dummies, i.e. of
0 and 1 values. The number of columns of the matrix is equal to the
number of different labels. Each row contains '1' in the place of the
associated label in the sorted list of labels, and '0' everywhere
else. Since in general one less dummy variable is required than the
number of labels, you can remove one column of dummies using the
option '-d'.

Usage:  gbdummyfy [options]

Options:
 -h  print this help
 -c  set the column of labels (default 1)
 -d  which column to remove, counting from 1 (default none)
 -v  print the labels and associated positions to standard error
 --help     same as -h
 --version  print version and license information

Examples:
 printf 'a 1\nb 2\n' | gbdummyfy  create a 2x3 matrix with dummy
                                  values relative to labels 'a' and 'b'

This program requires awk or gawk. Notice that it simply expands the
data adding new columns. When using the resulting matrix in other
utilities, the user should specify explicitly which dummy variables
to use and how.

A simple linear dependency can be automatically generated for 'gblreg'
by inserting the following expression in the functional specification

`seq 3 12 | sed 's/\(.*\)/\+d\1\*x\1/' | tr -d '\n'`

and

`seq 3 12 | sed 's/\(.*\)/,\1=0/' | tr -d '\n'`

among the initial conditions. In this case there are 10 different values
for the dummy. They occupy column positions from 3 to 12 and their initial
value is zero.
EOF
}

#help requested (or an option was not understood): print it and stop
if [ "$help" = "yes" ]; then
    usage
    exit
fi

#the column of labels must be a plain positive integer (no sign, no
#leading zero): anything else would make the numeric tests and the
#'cut' field lists below fail in confusing ways
case "$pos" in
    ''|0*|*[!0-9]*)
	echo "gbdummyfy: wrong column spec in option -c" >&2
	exit 1
	;;
esac

#create temporary files; 'mktemp' is used instead of 'tempfile', which
#is a deprecated Debian specific tool. The names are emptied first and
#a trap is installed so that the files are removed on every exit path,
#including an interruption by a signal.
dataorig= initial= final= dummies= newdummies=
trap 'rm -f "$dataorig" "$initial" "$final" "$dummies" "$newdummies"' EXIT
trap 'exit 1' HUP INT TERM
dataorig=$(mktemp) || exit 1
initial=$(mktemp) || exit 1
final=$(mktemp) || exit 1
dummies=$(mktemp) || exit 1
newdummies=$(mktemp) || exit 1

#store the original data removing possible initial empty spaces and
#shrinking any number of spaces and tab in one single space
tr '\t' ' ' | tr -s ' ' | sed 's/^ *//' > "$dataorig"

#count the number of columns, as found in the first line; with no
#input at all awk prints nothing, hence the fallback to 0
colnum=$(head -n 1 "$dataorig" | awk '{print NF}')
colnum=${colnum:-0}

#separate the input file in three parts: before the column of dummies
if [ "$pos" -gt 1 ]; then
    cut -d ' ' -f "1-$(( pos - 1 ))" < "$dataorig" > "$initial"
fi
#the column of dummies itself
cut -d ' ' -f "$pos" < "$dataorig" > "$dummies"
#after the column of dummies
if [ "$pos" -lt "$colnum" ]; then
    cut -d ' ' -f "$(( pos + 1 ))-" < "$dataorig" > "$final"
fi

#Expand a column of labels into rows of 0/1 dummies.
#  $1  file with one label per line (the first field of each line)
#  $2  "yes" to list every label and its position on standard error
#For each input line one row is written on standard output. A row has
#one entry per distinct label, each entry followed by a space, with
#"1" at the position of the label of that line in the sorted list of
#labels and "0" elsewhere.
#Only POSIX awk features are used: asorti() is a gawk extension and is
#not available when 'awk' is mawk or another implementation. LC_ALL=C
#makes the string comparison byte-wise, the order asorti() produces by
#default.
expand_labels () {
    LC_ALL=C awk -v verbose="$2" '

{
    #store the label of the line; appending "" forces a string, so
    #that numeric looking labels are never compared as numbers
    lab = $1 ""
    line[NR] = lab
    #record each distinct label once
    if (!(lab in seen)) {
	seen[lab] = 1
	labsort[++labnum] = lab
    }
}

END {
    #sort the labels as strings (insertion sort)
    for (i = 2; i <= labnum; i++) {
	cur = labsort[i]
	for (j = i - 1; j >= 1 && labsort[j] > cur; j--)
	    labsort[j + 1] = labsort[j]
	labsort[j + 1] = cur
    }
    #print labels if verbose
    if (verbose ~ "yes")
	for (i = 1; i <= labnum; i++)
	    print labsort[i], i | "cat 1>&2"
    #prepare the row of dummies associated to each label
    for (i = 1; i <= labnum; i++) {
	row = ""
	for (j = 1; j <= labnum; j++)
	    row = row (j == i ? "1 " : "0 ")
	dummy[labsort[i]] = row
    }
    #print output
    for (i = 1; i <= NR; i++)
	print dummy[line[i]]
}

' "$1"
}

expand_labels "$dummies" "$verbose" > "$newdummies"

#Print a matrix of dummies without one of its columns.
#  $1  file holding the matrix, entries separated by single spaces
#  $2  position of the column to drop, counting from 1
#  $3  number of columns of the matrix
#The result is written on standard output.
drop_dummy_column () {
    if [ "$2" -eq 1 ]; then
	cut -d ' ' -f 2- < "$1"
    elif [ "$2" -eq "$3" ]; then
	cut -d ' ' -f "1-$(( $3 - 1 ))" < "$1"
    else
	cut -d ' ' -f "1-$(( $2 - 1 )),$(( $2 + 1 ))-" < "$1"
    fi
}

#delete a column
if [ "$del" != "no" ]; then

    #the number of dummies is read from the first row; awk prints
    #nothing when there is no row, hence the fallback to 0
    labnum=$(head -n 1 "$newdummies" | awk '{print NF}')
    labnum=${labnum:-0}

    #accept only a plain positive integer (no sign, no leading zero)
    #which is 1 or not larger than the number of dummies
    case "$del" in
	''|0*|*[!0-9]*)
	    delok=no ;;
	*)
	    if [ "$del" -eq 1 ] || [ "$del" -le "$labnum" ]; then
		delok=yes
	    else
		delok=no
	    fi ;;
    esac

    if [ "$delok" = "yes" ]; then
	#the reduced matrix goes through a scratch file because the
	#source file cannot be read and overwritten at the same time
	storage=$(mktemp) || exit 1
	drop_dummy_column "$newdummies" "$del" "$labnum" > "$storage"
	cat "$storage" > "$newdummies"
	rm -f "$storage"
    else
	echo "gbdummyfy: wrong column spec in option -d; request ignored" >&2
    fi

fi

#Join the untouched columns with the matrix of dummies and print the
#result on standard output.
#  $1  number of columns of the original data
#  $2  position of the column of labels
#  $3  file with the columns before the labels
#  $4  file with the matrix of dummies
#  $5  file with the columns after the labels
#Only the files that hold data for the given layout are read. All
#names are quoted so that a temporary directory containing spaces or
#glob characters does not break the commands.
rebuild_output () {
    if [ "$1" = 1 ]; then
	cat "$4"
    elif [ "$2" = 1 ]; then
	paste -d ' ' "$4" "$5"
    elif [ "$2" = "$1" ]; then
	paste -d ' ' "$3" "$4"
    else
	paste -d ' ' "$3" "$4" "$5"
    fi
}

#rebuild the file
rebuild_output "$colnum" "$pos" "$initial" "$newdummies" "$final"

#remove temporary files; -f keeps this quiet if one is already gone
rm -f "$dataorig" "$initial" "$final" "$dummies" "$newdummies"
