/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify and distribute this software and its */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Alan W Black                            */
/*                     Date   :  October 1997                            */
/*-----------------------------------------------------------------------*/
/*  A program for testing a CART tree against data, also may be used to  */
/*  predict values using a tree and data                                 */
/*                                                                       */
/*=======================================================================*/
#include <stdlib.h>
#include <iostream.h>
#include <fstream.h>
#include <string.h>
#include "EST_Wagon.h"
#include "EST_cutils.h"
#include "EST_multistats.h"
#include "EST_Token.h"
#include "EST_cmd_line.h"

// Forward declarations for the file-local functions defined below:
// the command-line driver, the tree/vector helpers, and the three
// processing modes (prediction, class test, float test).
static int wagon_test_main(int argc, char **argv);
static LISP find_feature_value(char *feature, LISP vector, LISP description);
static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description);
static LISP get_data_vector(EST_TokenStream &data, LISP description);
static void simple_predict(EST_TokenStream &data, FILE *output, 
			   LISP tree, LISP description, int all_info);
static void test_tree_class(EST_TokenStream &data, FILE *output, 
			    LISP tree, LISP description);
static void test_tree_float(EST_TokenStream &data, FILE *output, 
			    LISP tree, LISP description);

// Program entry point: hands the command line to wagon_test_main().
// The value returned by wagon_test_main() is not used; when it returns,
// the process always terminates through exit(0).
int main(int argc, char **argv)
{
    (void)wagon_test_main(argc, argv);

    exit(0);

    // Not reached; keeps compilers that expect a return in main() quiet
    return 0;
}

static int wagon_test_main(int argc, char **argv)
{
    // Top level function sets up data and creates a tree
    EST_Option al;
    EST_StrList files;
    LISP description,tree=NIL;;
    EST_TokenStream data;
    FILE *wgn_output;

    parse_command_line(argc, argv,
       EST_String("Usage:\n")+
       "wagon_test  <options>\n"+
       "program to test CART trees on data\n"+
       "-desc <ifile>     Field description file\n"+
       "-data <ifile>     Datafile, one vector per line\n"+
       "-tree <ifile>     File containing CART tree\n"+
       "-predict          Predict for each vector returning distribution\n"+
       "-predict_val      Predict for each vector returning just value\n"+
       "-heap <int> {210000}\n"+
       "              Set size of Lisp heap, should not normally need\n"+
       "              to be changed from its default\n"+
       "-o <ofile>        File to save output in\n",
		       files, al);

    if (al.present("-v"))
    {
	printf("%s: %s\n",argv[0],wagon_version);
	exit(0);
    }

    siod_init(al.ival("-heap"));

    if (al.present("-desc"))
    {
	gc_protect(&description);
	description = car(vload(al.val("-desc"),1));
    }
    else
    {
	cerr << argv[0] << ": no description file specified" << endl;
	exit(-1);
    }

    if (al.present("-tree"))
    {
	gc_protect(&tree);
	tree = car(vload(al.val("-tree"),1));
	if (tree == NIL)
	{
	    cerr << argv[0] << ": no tree found in \"" << al.val("-tree")
		<< "\"" << endl;
	    exit(-1);
	}
    }
    else
    {
	cerr << argv[0] << ": no tree file specified" << endl;
	exit(-1);
    }

    if (al.present("-data"))
    {
	if (data.open(al.val("-data")) != 0)
	{
	    cerr << argv[0] << ": can't open data file \"" << 
		al.val("-data") << "\" for input." << endl;
	    exit(-1);
	}
    }
    else
    {
	cerr << argv[0] << ": no data file specified" << endl;
	exit(-1);
    }

    if (al.present("-o"))
    {
	if ((wgn_output = fopen(al.val("-o"),"w")) == NULL)
	{
	    cerr << argv[0] << ": can't open output file \"" <<
		al.val("-o") << "\"" << endl;
	}
    }
    else
	wgn_output = stdout;

    if (al.present("-predict"))
	simple_predict(data,wgn_output,tree,description,FALSE);
    else if (al.present("-predict_val"))
	simple_predict(data,wgn_output,tree,description,TRUE);
    else if (streq(get_c_string(car(cdr(car(description)))),"float") ||
	     streq(get_c_string(car(cdr(car(description)))),"int"))
	test_tree_float(data,wgn_output,tree,description);
    else
	test_tree_class(data,wgn_output,tree,description);

    if (wgn_output != stdout)
	fclose(wgn_output);
    data.close();
    return 0;
}

// Read one data vector: one token from DATA for every entry in
// DESCRIPTION.  A token is converted with atof() into a LISP number when
// the second element of its description entry is "float" or "int";
// otherwise it is interned as a symbol.
//
// Returns the values as a list in description order, or NIL when DATA is
// already at end of file.  If any token other than the first was preceded
// by a newline, an error is reported on cerr and siod_error() is called.
static LISP get_data_vector(EST_TokenStream &data, LISP description)
{
    if (data.eof())
        return NIL;

    LISP collected = NIL;    // values gathered so far, most recent first
    LISP field = description;

    while (field != NIL)
    {
        EST_Token tok = data.get();
        int is_first = (field == description);

        // Only the first token of a vector may follow a line break
        if (!is_first && tok.whitespace().contains("\n"))
        {
            cerr << "wagon_test: unexpected newline within vector " <<
                tok.row() << " wrong number of features" << endl;
            siod_error();
        }

        const char *field_type = get_c_string(car(cdr(car(field))));
        LISP item;
        if (streq(field_type,"float") || streq(field_type,"int"))
            item = flocons(atof(tok.string()));
        else
            item = strintern(tok.string());
        collected = cons(item,collected);

        field = cdr(field);
    }

    return reverse(collected);
}

// Run TREE over every vector read from DATA and write one line per vector
// to OUTPUT.
//
// NOTE: despite its name, a non-zero ALL_INFO prints only the last element
// of the prediction; when ALL_INFO is zero the whole prediction is
// printed.
static void simple_predict(EST_TokenStream &data, FILE *output,
                           LISP tree, LISP description, int all_info)
{
    LISP row;

    while ((row = get_data_vector(data,description)) != NIL)
    {
        LISP prediction = wagon_vector_predict(tree,row,description);
        LISP shown;

        if (all_info)
            shown = car(reverse(prediction));   // last element only
        else
            shown = prediction;

        EST_String text = siod_sprint(shown);
        fprintf(output,"%s\n",(const char *)text);
    }
}

// Test TREE against every vector in DATA when the predicted field is
// numeric, and write a one line summary (RMSE, correlation, mean absolute
// error and its standard deviation) to OUTPUT.
//
// For each vector the predicted value is the last element of the
// prediction and the actual value is the first element of the vector.
//
// NOTE(review): the correlation divides by the product of the two
// standard deviations without checking for zero, so no data, or constant
// predicted or actual values, leave the denominator at zero.
static void test_tree_float(EST_TokenStream &data, FILE *output,
                            LISP tree, LISP description)
{
    // Running statistics: predicted (p) and actual (a) values, their
    // squares and cross product, plus squared and absolute error
    EST_SuffStats p_stat,a_stat,pp_stat,aa_stat,pa_stat,sq_err,abs_err;
    LISP row;

    while ((row = get_data_vector(data,description)) != NIL)
    {
        LISP prediction = wagon_vector_predict(tree,row,description);
        float predict_val = get_c_float(car(reverse(prediction)));
        float real_val = get_c_float(car(row));
        double error = predict_val-real_val;

        p_stat += predict_val;
        a_stat += real_val;
        sq_err += error*error;
        abs_err += fabs(error);
        pp_stat += predict_val*predict_val;
        aa_stat += real_val*real_val;
        pa_stat += predict_val*real_val;
    }

    // Covariance over the product of the two standard deviations
    double cor = (pa_stat.mean() - (p_stat.mean()*a_stat.mean()))/
        (sqrt(pp_stat.mean()-(p_stat.mean()*p_stat.mean())) *
         sqrt(aa_stat.mean()-(a_stat.mean()*a_stat.mean())));

    fprintf(output,";; RMSE %1.4f Correlation is %1.4f Mean (abs) Error %1.4f (%1.4f)\n",
            sqrt(sq_err.mean()),
            cor,
            abs_err.mean(),
            abs_err.stddev());
}

// Test TREE against every vector in DATA when the predicted field is a
// class, then build and print a confusion matrix of actual against
// predicted class.
//
// For each vector the predicted class is the last element of the
// prediction and the actual class is the first element of the vector.
// The class names given to confusion() are every element after the first
// in the first description entry.
//
// NOTE: OUTPUT is not used; print_confusion() is not given a stream here.
static void test_tree_class(EST_TokenStream &data, FILE *output,
                            LISP tree, LISP description)
{
    (void)output;

    EST_StrStr_KVL pairs;    // (actual, predicted) for every vector
    EST_StrList lex;         // class names
    LISP row;

    while ((row = get_data_vector(data,description)) != NIL)
    {
        LISP prediction = wagon_vector_predict(tree,row,description);
        EST_String predicted(get_c_string(car(reverse(prediction))));
        EST_String actual(get_c_string(car(row)));
        pairs.add_item(actual,predicted,1);
    }

    LISP name = cdr(car(description));
    while (name != NIL)
    {
        lex.append(get_c_string(car(name)));
        name = cdr(name);
    }

    const EST_FMatrix &m = confusion(pairs,lex);
    print_confusion(m,pairs,lex);
}

// Walk TREE for VECTOR and return what is stored at the leaf reached.
//
// A node whose cdr is NIL is a leaf and its car is the result.  Any other
// node holds a question in its car, followed by the subtree to take on a
// yes answer and then the subtree to take on a no answer.  The question
// is asked of the value in VECTOR of the feature it names, found through
// DESCRIPTION.
static LISP wagon_vector_predict(LISP tree, LISP vector, LISP description)
{
    LISP node = tree;

    while (cdr(node) != NIL)
    {
        LISP question = car(node);
        LISP value = find_feature_value(wgn_ques_feature(question),
                                        vector, description);

        if (wagon_ask_question(question,value))
            node = car(cdr(node));          // yes branch
        else
            node = car(cdr(cdr(node)));     // no branch
    }

    return car(node);
}

// Return the element of VECTOR that sits at the same position as the
// DESCRIPTION entry whose first element is named FEATURE.
//
// The two lists are stepped through together until VECTOR runs out.  When
// no entry matches, an error is reported on cerr and siod_error() is
// called; NIL is returned should that call return.
static LISP find_feature_value(char *feature, LISP vector, LISP description)
{
    LISP val = vector;
    LISP entry = description;

    while (val != NIL)
    {
        if (streq(feature,get_c_string(car(car(entry)))))
            return car(val);
        val = cdr(val);
        entry = cdr(entry);
    }

    cerr << "wagon_test: can't find feature \"" << feature <<
        "\" in description" << endl;
    siod_error();
    return NIL;
}
