/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                                                                       */
/*                 Author: Paul Taylor                                   */
/*                   Date: 6 Jan 1998                                    */
/* --------------------------------------------------------------------- */
/*              UniSyn prosody manipulation functions                    */
/*                                                                       */
/*************************************************************************/

#include "UniSyn.h"


// Type of a local cost function as accepted by dp_match(): it takes two
// items and returns a float cost. local_cost() below has this signature
// and is the function passed to dp_match() by warp_f0().
typedef
float (*local_cost_function)(const EST_Item *item1,
			     const EST_Item *item2);

// Forward declaration only; the definition is not in the visible part
// of this file.  warp_f0() calls it with the target segments as
// `lexical`, the source segments as `surface`, a relation named "Match"
// as `match`, and the address of a default-constructed item as
// `null_syl`.
// NOTE(review): presumably this aligns the two relations by dynamic
// programming and records the alignment in `match` -- confirm against
// the definition.
bool dp_match(const EST_Relation &lexical,
	      const EST_Relation &surface,
	      EST_Relation &match,
	      local_cost_function lcf,
	      EST_Item *null_syl);

float local_cost(const EST_Item *s1, const EST_Item *s2)
{
    float insertion_cost = get_c_int(siod_get_lval("met_insertion", NULL));
    float deletion_cost = get_c_int(siod_get_lval("met_deletion", NULL));
    float substitution_cost = 
	get_c_int(siod_get_lval("met_substitution", NULL));

    EST_String null_sym = "nil";

    // otherwise cost is either insertion cost, or cost_matrix value
    if (s1->name() == s2->name())
	return 0;
    else
    {
	if (s1->name() == null_sym)
	    return insertion_cost;
	else if (s2->name() == null_sym)
	    return deletion_cost;
	else 
	    return substitution_cost;
    }
}

/* Build a single channel F0 track fz from the pitchmark track pm.

   fz is resized to (int)(pm.end()/shift) frames and its times are
   filled using shift.  For each frame except the last, the F0 value is
   the reciprocal of the frame size that get_time_frame_size() reports
   for the pitchmark found by pm.index_below() at that frame's time.
   The last frame of fz is not assigned here. */
void pitchmarks_to_f0(EST_Track &pm, EST_Track &fz, float shift)
{
    fz.resize((int)(pm.end()/shift), 1);
    fz.fill_time(shift);

    // Stop one frame short of the end of the track
    const int stop = fz.num_frames() - 1;
    int frame = 0;

    while (frame < stop)
    {
	const float period =
	    get_time_frame_size(pm, pm.index_below(fz.t(frame)));
	fz.a(frame) = 1.0 / period;
	++frame;
    }
}

/* Convert an F0 contour into a set of pitchmarks. This is done by the
obvious iterative function: each pitchmark is placed one period
(1/F0) after the previous one, with F0 read from fz at the time of the
previous pitchmark.

Wherever the F0 value read is not positive, def_f0 is used instead, so
those stretches are filled with regularly spaced pitchmarks at
intervals 1/def_f0. If target_end lies beyond the end of fz, more
default pitchmarks are placed after the end of the last f0 value until
time target_end has been reached.

The last pitchmark generated is dropped by the final resize. pm is
cleared first and is left with zero channels.
*/

void f0_to_pitchmarks(EST_Track &fz, EST_Track &pm, float target_end)
{
    int i;
    float prev_pm = 0.0, val;
    const float def_f0 = 100.0;

    // It is impossible to know the length of the pitchmark array
    // beforehand. Here we find an upper limit and resize at the end.
    // The limit is set by the highest rate at which pitchmarks can be
    // produced.  Because def_f0 stands in for every non-positive F0
    // value, it must take part in the maximum: otherwise a contour
    // whose peak is below def_f0 (or which is entirely unvoiced) would
    // produce more pitchmarks than were allocated.
    float max = def_f0;
    for (i = 0; i < fz.num_frames(); ++i)
	if (fz.a(i) > max)
	    max = fz.a(i);

    pm.clear();
    pm.resize(int(max * (Gof(fz.end(), target_end))) + 10, 0);

    for (i = 0; prev_pm < fz.end(); ++i)
    {
	// prev_pm is a float time here, not a frame index
	// NOTE(review): presumably this selects EST_Track's time based
	// accessor -- confirm against the EST_Track declaration.
	val = fz.a(prev_pm) > 0.0 ? fz.a(prev_pm) : def_f0;
	pm.t(i) = prev_pm + (1.0 / val);
	prev_pm = pm.t(i);
    }

    if (target_end > fz.end())
	for (; prev_pm < target_end; ++i)
	{
	    pm.t(i) = prev_pm + (1.0 / def_f0);
	    prev_pm = pm.t(i);
	}

    // Trim to the pitchmarks produced, minus the last one.  If nothing
    // was produced, do not ask for a negative length.
    pm.resize(i > 0 ? i - 1 : 0, 0);
}

/* Generate pitchmarks from the F0 targets in targ.

   The leaves of targ are read for their "pos" (time) and "f0"
   features.  F0 is interpolated linearly between successive targets
   and a pitchmark is placed one period (1/F0) after the previous one,
   starting with a pitchmark at time 0.  Targets with f0 below 30,
   targets at the same time as the previous accepted one, and targets
   that go backwards in time (reported on cerr) are skipped.

   After the last target, pitchmarks continue at the last accepted
   target's F0 until time `end` is reached.  If targ has no leaves, 120
   is used throughout.

   pitchmarks is cleared and ends up with one frame per pitchmark and
   num_channels channels. */
void targets_to_pitchmarks(EST_Relation &targ, EST_Track &pitchmarks, 
			   int num_channels,float end)
{
    EST_Item *s;
    float time, target_f0, f0, prev_time, prev_f0, m, max;
    int i;

    prev_time = 0;
    prev_f0 = targ.first_leaf() ? targ.first_leaf()->fF("f0") : 120;

    // It is impossible to know the length of the pitchmark array
    // beforehand. Here we find an upper limit and resize at the end
    for (max = 0.0, s = targ.first_leaf(); s; s = next_leaf(s))
	if (s->fF("f0") > max)
	    max = s->fF("f0");

    // With no targets the default F0 above drives the final loop, so
    // the limit has to cover it as well.
    if (max < prev_f0)
	max = prev_f0;

    // Up to end*max + 2 frames can be needed (the pitchmark at time 0
    // plus the one that first reaches `end`), so keep a little fixed
    // slack on top of the 10% margin for very short utterances.
    // NOTE(review): targets positioned after `end` are not covered by
    // this limit.
    pitchmarks.clear();
    pitchmarks.resize((int)(max * 1.1 * end)+3,num_channels);

    pitchmarks.t(0) = 0.0;

    for (i = 1, s = targ.first_leaf(); s; s = next_leaf(s))
    {
	time = s->f("pos");
	target_f0 = s->fF("f0");

	if (target_f0 < 30)  // to protect against duff IntTarget algorithms
	    continue; 
	if (time == prev_time)
	    continue;
	else if (time < prev_time)
	{
	    cerr << "UniSyn: warning target in wrong order at " << prev_time;
	    cerr << " ignored" << endl;
	    continue;
	}
	// slope of the line from the previous target to this one
	m = (target_f0 - prev_f0) / (time - prev_time);

	for (; pitchmarks.t(i - 1) < time; ++i)
	{
	    f0 = (m * (pitchmarks.t(i - 1) - prev_time)) + prev_f0;
	    pitchmarks.t(i) = pitchmarks.t(i - 1) + 1.0/f0;
	}
	// Anchor the next segment on this target's own value, not on
	// the last interpolated value computed in the loop above.
	prev_time = time;
	prev_f0 = target_f0;
    }
    // Ensure pitch marks go to the end of the utterance
    // This will effectively mean the last half diphone will be extended
    // over the whole final segment.  This will only be reasonable if the
    // final segment is a silence.
    for (; pitchmarks.t(i - 1) < end; ++i)
	pitchmarks.t(i) = pitchmarks.t(i - 1) + 1.0/prev_f0;
    pitchmarks.resize(i, pitchmarks.num_channels());
}    

/* Fill the single channel track f0 with a contour derived from the
   F0 targets in targ, one frame every `shift`.

   The leaves of targ are read for their "pos" and "f0" features.  The
   track length is taken from the "pos" of the last leaf.  If targ has
   no leaves, f0 is left cleared.  If there is only one target every
   frame gets that target's F0. */
void targets_to_f0(EST_Relation &targ, EST_Track &f0, const float shift)
{
    EST_Item *s;
    float prev_f0, prev_pos, m;
    int i;

    f0.clear();

    // No targets: there is no end time to size the track from
    if (targ.last_leaf() == 0)
	return;

    f0.resize(int(ceil(targ.last_leaf()->fF("pos") / shift)), 1);
    f0.fill_time(shift);

    // Start from the first leaf, in keeping with last_leaf() above and
    // next_leaf() below
    s = targ.first_leaf();

    // fill with zeros until first target;
    for (i = 0; i < f0.num_frames(); ++i)
    {
	if (f0.t(i) > s->fF("pos"))
	    break;
	f0.a(i) = 0.0;
    }

    prev_pos = s->fF("pos");
    prev_f0 = s->fF("f0");

    s = next_leaf(s);

    // This loop starts again at frame 0, so the zeros written above are
    // overwritten.  m stays 0 until the second target has been passed,
    // which makes every frame up to the second target equal to the
    // first target's F0.
    // NOTE(review): that first stretch is flat rather than
    // interpolated -- confirm whether this is intended.
    for (m=0.0,i = 0; i < f0.num_frames(); ++i)
    {
	// s is null here only when there was a single target
	if (s && f0.t(i) > s->fF("pos"))
	{
	    prev_pos = s->fF("pos");
	    prev_f0 = s->fF("f0");
	    s = next_leaf(s);
	    if (s == 0)
		break;
	    m = (s->fF("f0") - prev_f0)/ (s->fF("pos") - prev_pos);
	}
	f0.a(i) = (m * (f0.t(i) - prev_pos)) + prev_f0;
    }
}    

/* Fill target_pm with pitchmark times for an F0 that moves linearly
   from start_f0 towards end_f0.

   target_pm is resized to the number of frames and channels of
   source_pm.  The first pitchmark is at time 0; pitchmark i follows
   pitchmark i-1 by one period of the F0 reached a fraction
   i/num_frames of the way from start_f0 to end_f0.  Only the times of
   target_pm are written.  Nothing is written if there are no frames. */
void linear_pitchmarks(EST_Track &source_pm, EST_Track &target_pm, 
		       float start_f0, float end_f0)
{
    target_pm.resize(source_pm.num_frames(), source_pm.num_channels());

    const int n_frames = target_pm.num_frames();
    if (n_frames < 1)   // no frame 0 to write to
	return;

    const float f0_range = end_f0 - start_f0;

    target_pm.t(0) = 0.0;

    for (int i = 1; i < n_frames; ++i)
    {
	const float pitch = (((float)i / (float)n_frames) * f0_range)
	    + start_f0;
	target_pm.t(i) = target_pm.t(i - 1) + (1 / pitch);
    }
}    
	      
/* Rescale the time of every frame in f0 in place: the offset of each
   time from s_last_time is multiplied by stretch and then moved to
   start from t_last_time.  The values in the track are not touched. */
void stretch_f0_time(EST_Track &f0, float stretch, 
		     float s_last_time, float t_last_time)
{
    int frame = 0;

    while (frame < f0.num_frames())
    {
	float &stamp = f0.t(frame);
	stamp = ((stamp - s_last_time) * stretch) + t_last_time;
	++frame;
    }
}

// Make target F0 from source F0, with the same F0 values as the
// original, but with durations specified by target_seg.
//
// The segments of target_seg are matched to those of source_seg with
// dp_match().  For each target segment that has a matched source
// segment, the frames of the F0 track belonging to that source segment
// are retimed by the ratio of target duration to source duration.
//
// Side effects: diagnostic output is written to cout, and the stretch
// factor of each processed segment is saved to the file "zz_stretch".

void warp_f0(EST_Track &source_f0, EST_Relation &source_seg,
	     EST_Track &target_f0, EST_Relation &target_seg)
{
    EST_Item *s, *t;
    float prev_source_end = 0.0, prev_target_end = 0.0;
    EST_Track part;
    int frame_start, frame_end;
    float stretch, t_last_time = 0, s_last_time = 0;
    EST_Relation match("Match");
    EST_Item xx;
    EST_Track str;
    int i = 0;

    // xx is a default-constructed item passed as dp_match's null_syl
    dp_match(target_seg, source_seg, match, local_cost, &xx);

    // Start from a copy of the source contour; its times are rewritten
    // segment by segment below
    target_f0 = source_f0;
    frame_start = 0;
    frame_end = 0;

    // One entry per target segment at most; i counts the entries used
    str.resize(target_seg.length(), 1);

    cout << "tag: " << target_seg << endl;

    for (t = target_seg.head(); t; t = next(t))
    {
	// The matched source segment is the first daughter of t in the
	// "Match" relation
	s = t->as_relation("Match")->daughter1();
	if (s == 0) // ie extra phone in target specification
	    continue;

	// Frame of the source contour at the end time of the source segment
	frame_end = source_f0.index(s->f("end"));
	if ((frame_end - frame_start) < 1)
	{
	    cout << "Warning no frames for: " << *t << endl;
	    continue;
	}
	// NOTE(review): presumably sub_track makes `part` a window onto
	// target_f0's frames, so that retiming `part` below changes
	// target_f0 itself -- confirm against EST_Track::sub_track.
	target_f0.sub_track(part, frame_start, (frame_end - frame_start + 1));

	// Ratio of this segment's target duration to its source duration
	stretch = (t->fF("end") - prev_target_end) / 
		   (s->fF("end") - prev_source_end);

	// Record the stretch factor against the target segment's end time
	str.a(i) = stretch;
	str.t(i++) = t->fF("end");

	cout << "\nstretch: " << stretch << endl;
	cout << "source: " << *s << endl;
	cout << "target: " << *t << endl;
	cout << "frames: " << frame_start << " " << frame_end << endl;

	stretch_f0_time(part, stretch, s_last_time, t_last_time);

	// Carry the state over to the next segment
	prev_target_end = t->f("end");
	prev_source_end = s->f("end");
	frame_start = frame_end + 1;
	t_last_time = part.end();
	s_last_time = source_f0.t(frame_end);
	cout << "last time = " << s_last_time << " " << t_last_time << endl;
    }
    // Cut the track at the last frame index reached and set the value of
    // its final frame to 100
    target_f0.resize(frame_end, 1);
    target_f0.a(target_f0.num_frames() - 1) = 100;
    str.save("zz_stretch");
}

/* Produce pitchmarks for utt that follow the F0 of source_pm but the
   segment timing of target_seg.

   Steps: add_end_silences() is applied to target_seg; an F0 contour is
   derived from source_pm at a 0.01 frame shift; warp_f0() retimes it
   to target_seg; and the result is turned back into pitchmarks.  The
   new pitchmark track is stored as the "coefs" feature of an item
   named "coefs" appended to the "TargetCoef" relation of utt, and
   source_seg is copied into the "SourceSegments" relation.

   Side effects: target_seg is printed to cout three times, and the
   intermediate tracks and label relations are saved to the files
   "tt_tar.f0", "tt_tar.lab", "tt_sou.lab", "tt_sou.f0" and
   "target_coef_a.pm". */
void warp_pitchmarks(EST_Utterance &utt, EST_Track *source_pm, 
		    EST_Relation &source_seg, EST_Relation &target_seg)
{
    EST_Track src_contour, warped_contour;
    // Handed over to the utterance item at the end of this function
    EST_Track *warped_pm = new EST_Track;

    cout << "tag: "<< target_seg << endl;
    add_end_silences(target_seg);
    cout << "tag 2: "<< target_seg << endl;

    pitchmarks_to_f0(*source_pm, src_contour, 0.01);
    cout << "tag 3: "<< target_seg << endl;

    warp_f0(src_contour, source_seg, warped_contour, target_seg);
    f0_to_pitchmarks(warped_contour, *warped_pm);

    utt.create_relation("TargetCoef");
    utt.create_relation("SourceSegments");
    *utt.relation("SourceSegments") = source_seg;

    EST_Item *coef_item = utt.relation("TargetCoef")->append();

    // Dump the intermediate results
    warped_contour.save("tt_tar.f0", "est");
    target_seg.save("tt_tar.lab");
    source_seg.save("tt_sou.lab");
    src_contour.save("tt_sou.f0", "est");
    warped_pm->save("target_coef_a.pm","est");

    coef_item->fset("name", "coefs");
    coef_item->fset("coefs", (void *)(warped_pm), gc_track);
}

/* Turn the F0 targets of utt into pitchmarks.

   Creates the "TargetCoef" relation, builds a pitchmark track from the
   "Target" relation with targets_to_pitchmarks(), and stores it as the
   "coefs" feature of an item named "coef" appended to "TargetCoef".

   The track gets as many channels as the "coefs" track on the head of
   the "SourceCoef" relation, or none if utt has no such relation.  The
   pitchmarks run to the "end" of the last item in "Segment", or to 0
   if that relation has no items. */
void us_F0targets_to_pitchmarks(EST_Utterance &utt)
{
    utt.create_relation("TargetCoef");
    EST_Track *pm_track = new EST_Track;

    int n_chan = 0;
    if (utt.has_relation("SourceCoef"))
    {
	EST_Item *src_item = utt.relation("SourceCoef")->head();
	EST_Track *src_track = (EST_Track *)src_item->fP("coefs");
	n_chan = src_track->num_channels();
    }

    float utt_end = 0;
    EST_Item *final_seg = utt.relation("Segment")->last();
    if (final_seg)
	utt_end = final_seg->fF("end");

    targets_to_pitchmarks(*(utt.relation("Target")), *pm_track,
			  n_chan, utt_end);

    EST_Item *coef_item = utt.relation("TargetCoef")->append();
    coef_item->fset("name", "coef");
    coef_item->fset("coefs", (void *)(pm_track), gc_track);
}

