/*
**	SWISH++
**	extract.c
**
**	Copyright (C) 1998  Paul J. Lucas
**
**	This program is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 2 of the License, or
**	(at your option) any later version.
** 
**	This program is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
** 
**	You should have received a copy of the GNU General Public License
**	along with this program; if not, write to the Free Software
**	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

// standard
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>

// local
#include "config.h"
#include "directory.h"
#include "ext_proc.h"
#include "fake_ansi.h"
#include "file_vector.h"
#include "postscript.h"
#include "string_set.h"
#include "util.h"
#include "version.h"

//
// Variables maintained by getopt(3); main() reads and sets them while parsing
// the command-line options.
//
extern "C" {
	extern char*	optarg;
	extern int	optind, opterr;
}

//
// The using-directive is skipped when PJL_NO_NAMESPACES is defined (presumably
// for compilers without namespace support -- confirm where it is defined).
//
#ifndef	PJL_NO_NAMESPACES
using namespace std;
#endif

string_set	extensions;		// file extensions to index (-e option)
char const*	me;			// executable name: base name of argv[0]
int		num_files;		// incremented by do_file() per file parsed
int		verbosity;		// how much to print; main() clamps to 0-3

// Extract the text from a single file; defined below.
void		do_file( char const *path );
// Print the usage message to standard error and exit with status 1.
void		usage();

//*****************************************************************************
//
// SYNOPSIS
//
	int main( int argc, char *argv[] )
//
// DESCRIPTION
//
//	Parse the command line, then extract text from everything named on
//	it: each argument that is a directory is handed to do_directory() and
//	every other argument to do_file().  When the verbosity level is
//	non-zero, a summary giving the elapsed time and the number of files
//	extracted is printed to standard output at the end.
//
// PARAMETERS
//
//	argc	The number of arguments.
//
//	argv	A vector of the arguments; argv[argc] is null.  Aside from
//		the options below, the arguments are the names of the files
//		and directories to extract.
//
// RETURN VALUE
//
//	Returns 0 once all arguments have been processed.  The -V option
//	exits with status 0 after printing the version; a bad option or a
//	command line naming no files or directories ends in usage().
//
// SEE ALSO
//
//	Stroustrup, Bjarne.  "The C++ Programming Language, 3rd ed."
//	Addison-Wesley, Reading, MA.  pp. 116-118.
//
//*****************************************************************************
{
	//
	// Determine the base name of the executable for use in messages.
	//
	char const *const last_slash = ::strrchr( argv[0], '/' );
	me = last_slash ? last_slash + 1 : argv[0];

	/////////// Process command-line options //////////////////////////////

	::opterr = 1;			// have getopt(3) report bad options
	int opt;
	//
	// getopt(3) is specified to return -1 (rather than EOF, which belongs
	// to <stdio.h>) once all of the options have been consumed.
	//
	while ( (opt = ::getopt( argc, argv, "e:lv:V" )) != -1 )
		switch ( opt ) {

			case 'e': // Specify filename extension(s) to index.
				extensions.insert( ::optarg );
				break;

			case 'l': // Follow symbolic links during indexing.
				follow_symbolic_links = true;
				break;

			case 'v': { // Specify verbosity level, clamped to 0-3.
				int const level = ::atoi( ::optarg );
				verbosity = level < 0 ? 0 : level > 3 ? 3 : level;
				break;
			}

			case 'V': // Display version and exit.
				cout << "SWISH++ " << version << endl;
				::exit( 0 );

			case '?': // Bad option.
				usage();
				break;
		}

	//
	// Skip past the options so that only the file and directory names
	// remain; at least one is required.
	//
	argc -= ::optind;
	argv += ::optind;
	if ( !argc )
		usage();

	////////// Extract text from specified files //////////////////////////

	time_t const start_time = ::time( 0 );
	for ( ; *argv; ++argv ) {
		if ( is_directory( *argv ) )
			do_directory( *argv );
		else
			do_file( *argv );
	}

	if ( verbosity ) {
		time_t const elapsed = ::time( 0 ) - start_time;
		cout	<< setfill('0')
			<< "\nExtraction done:\n  "
			<< setw(2) << (elapsed / 60) << ':'
			<< setw(2) << (elapsed % 60)
			<< " elapsed time\n  "
			<< num_files << " files\n\n"
			<< setfill(' ');
	}

	return 0;
}

//*****************************************************************************
//
// SYNOPSIS
//
	void do_file( char const *file_name )
//
// DESCRIPTION
//
//	Extract the text from a single file and write it, one word per line,
//	to a companion file whose name is the file's name with ".txt"
//	appended.  This algorithm is loosely based on what the Unix
//	strings(1) command does except it goes a bit further to discard
//	things like Encapsulated PostScript and raw hex data.
//
//	The file is silently skipped if it is not a plain file, if it is a
//	symbolic link and links are not being followed, if it has no filename
//	extension or one that is not in the set of extensions to index, or if
//	its ".txt" file already exists or can not be created.
//
//	If the extension has an entry in the ext_proc_map, the file is first
//	run through that entry's "undo" command, the "real" extension
//	underneath (e.g., "txt" in "foo.txt.gz") is the one checked against
//	the set of extensions, and the file is run through the entry's "redo"
//	command when done.
//
// PARAMETERS
//
//	file_name	The path of the file to extract text from.
//
//*****************************************************************************
{
	////////// Determine if we should process the file ////////////////////

	//
	// NOTE(review): is_plain_file() is called without an argument and the
	// stat_buf used further below is not declared in this file; presumably
	// both come from util.h and is_plain_file() examines the result of a
	// stat(2) already performed by the caller -- confirm.
	//
	if ( !is_plain_file() ||
		( is_symbolic_link( file_name ) && !follow_symbolic_links )
	)
		return;

	//
	// Check to see if the file name has a '.' in it and that it is not the
	// last character.
	//
	char const *ext = ::strrchr( file_name, '.' );
	if ( !ext || !*++ext )
		return;

	//
	// If the candidate extension contains a '/', then it's really not an
	// extension; rather, it's a file name like: "/a.bizarre/file".
	//
	if ( ::strchr( ext, '/' ) )
		return;

	//
	// Determine if the file needs to be preprocessed first.
	//
	static ext_proc_map ext_procs;
	ext_proc_map::value_type const *const proc = ext_procs[ ext ];

	if ( proc ) {
		//
		// Get the "real" filename extension, e.g., get "txt" out of
		// "foo.txt.gz".  Search backwards from the '.' introducing the
		// outer extension for the previous '.', never stepping before
		// the start of the name nor out of the last path component
		// (a '.' in a directory name, as in "./foo.gz", doesn't count).
		//
		char const *const outer_dot = ext - 1;
		char const *inner_dot = outer_dot;
		bool found_inner_dot = false;
		while ( inner_dot > file_name ) {
			--inner_dot;
			if ( *inner_dot == '/' )
				break;
			if ( *inner_dot == '.' ) {
				found_inner_dot = true;
				break;
			}
		}
		if ( !found_inner_dot ) {
			//
			// File doesn't have a "real" extension, e.g., it's
			// something like "bar.gz".
			//
			return;
		}

		//
		// Strings are used so that an extension of any length fits.
		//
		static string real_ext;
		real_ext = string( inner_dot + 1, outer_dot );
		static string fixed_file_name;
		fixed_file_name = string( file_name, outer_dot );
		file_name = fixed_file_name.c_str();
		ext = real_ext.c_str();
	}

	//
	// Skip the file if the set of acceptable extensions does not contain
	// the candidate.
	//
	if ( !extensions.find( ext ) )
		return;

	//
	// Check to see if the .txt file already exists; if so, skip it.
	//
	string const file_name_txt = string( file_name ) + ".txt";
	if ( ::stat( file_name_txt.c_str(), &stat_buf ) != -1 )
		return;
	ofstream txt( file_name_txt.c_str() );
	if ( !txt )
		return;

	if ( proc && !process_file( proc->undo, file_name ) )
		return;

	file_vector<char> file( file_name );
	if ( !file ) {
		//
		// The file was already "undone" above: put it back the way it
		// was before giving up on it.
		//
		if ( proc )
			process_file( proc->redo, file_name );
		return;
	}

	if ( verbosity > 2 ) {			// print base name of file
		char const *const slash = ::strrchr( file_name, '/' );
		cout << "  " << ( slash ? slash + 1 : file_name ) << flush;
	}

	////////// Parse the file /////////////////////////////////////////////

	++num_files;

	char word_buf[ Word_Hard_Max_Size + 1 ];
	char *word = word_buf;
	int word_len = 0;
	int num_words = 0;
	bool in_word = false;
	bool in_postscript = false;

	//
	// The end of the file is treated just like a non-word character so
	// that a word ending exactly at the end of the file isn't lost; hence
	// the loop makes one final pass after the last character is consumed.
	//
	file_vector<char>::const_iterator c = file.begin();
	for ( bool at_end = false; !at_end; ) {
		at_end = c == file.end();

		////////// Collect a word /////////////////////////////////////

		if ( !at_end ) {
			char const ch = *c++;
			if ( is_word_char( ch ) || ch == '%' ) {
				if ( !in_word ) {
					// start a new word
					word = word_buf;
					word[ 0 ] = ch;
					word_len = 1;
					in_word = true;
					continue;
				}
				if ( word_len < Word_Hard_Max_Size ) {
					// continue same word
					word[ word_len++ ] = ch;
					continue;
				}
				in_word = false;	// too big: skip chars
				while ( c != file.end() && is_word_char( *c++ ) ) ;
				continue;
			}
		}

		if ( !in_word )
			continue;

		////////// Got a word /////////////////////////////////////////

		in_word = false;
		if ( word_len < Word_Hard_Min_Size )
			continue;

		word[ word_len ] = '\0';

		//
		// Look for Encapsulated PostScript code and skip it.
		//
		if ( in_postscript ) {
			if ( !::strcmp( word, "%%Trailer" ) )
				in_postscript = false;
			continue;
		}
		static postscript_comment_set postscript_comments;
		if ( postscript_comments.find( word ) ) {
			in_postscript = true;
			continue;
		}

		static postscript_operator_set postscript_operators;
		if ( postscript_operators.find( word ) )
			continue;

		//
		// Strip chars not in Word_End_Chars from end of word.  The
		// char is converted to unsigned char first since tolower() is
		// undefined for negative values.
		//
		while ( word_len > 0 && !::strchr( Word_End_Chars,
			tolower( static_cast<unsigned char>( word[ word_len - 1 ] ) )
		) )
			--word_len;
		if ( word_len < Word_Hard_Min_Size )
			continue;

		word[ word_len ] = '\0';

		//
		// Strip chars not in Word_Begin_Chars from beginning of word;
		// a leading '%' is kept as well.
		//
		while ( *word && *word != '%' && !::strchr( Word_Begin_Chars,
			tolower( static_cast<unsigned char>( *word ) )
		) )
			++word, --word_len;
		if ( word_len < Word_Hard_Min_Size )
			continue;

		//
		// Discard what looks like ASCII hex data, i.e., a sufficiently
		// long word made up entirely of hex digits.
		//
		if ( word_len >= Word_Hex_Min_Size && static_cast<int>(
			::strspn( word, "0123456789abcdefABCDEF" )
		) == word_len )
			continue;

		if ( !is_ok_word( word ) )
			continue;

		++num_words;
		txt << word << '\n';
	}

	if ( verbosity > 2 )
		cout << " (" << num_words << " words)" << endl;

	if ( proc )			// restore file to the way it was
		process_file( proc->redo, file_name );
}

//*****************************************************************************
//
//	Miscellaneous function(s)
//
//*****************************************************************************

//
// Print a summary of the command-line syntax, prefixed by the executable's
// name, to standard error and then terminate the program with exit status 1.
// This function does not return.
//
void usage() {
	static char const option_summary[] =
		" options:\n"
		" --------\n"
		"  -e ext          : Extension to index\n"
		"  -l              : Follow symbolic links\n"
		"  -v verbosity    : Verbosity level [0-3]\n"
		"  -V              : Print version number and exit\n";

	cerr << "usage: " << me << " [options] dir ... file ...\n";
	cerr << option_summary;
	::exit( 1 );
}
