#!/usr/bin/perl


$fn = $ARGV[0];
$mem = 2; # Amount of "context" used to choose next token (eg, 2 means choose 3-tuples based on 2-tuples)
$sep = "|"; # Internal separator character; should appear minimally, or not at all, in text
$par = "@";
$punc = "[,.?!;:]\$";

open(F, $fn);

if (! -e F) {
    print "File does not exist\n";
}
elsif (! -r F) {
    print "File cannot be read\n";
}
else {
    $lim = "";
    @tokens = ();
    for ($i = 0; $i < $mem; $i++) { push(@tokens, $par) };

    # GATHER TOKENS
    $prev_par = 1;
    while ($line = <F>) {
	chop($line);
	$line =~ s/\s+/ /g;
        $line =~ s/$sep//g;
        $line =~ s/^ //g;
        $line =~ s/ $//g;
	if ($line =~ /^$/) {
	    if (0 == $prev_par) {
                for ($i = 0; $i < $mem; $i++) { push(@tokens, $par) };
		$prev_par = 1;
	    }
	}
	else {
	    $prev_par = 0;
            @words = split(/ /, $line);
	    foreach $w (@words) {
	        $punctuation = "";
	        if ($w =~ /$punc/) { $punctuation = chop($w); }
	        push(@tokens, $w);
	        if ("" ne $punctuation) { push(@tokens, $punctuation); }
	    }
	}
    }
    push(@tokens, $par);
    close(F);


    # FIRST PASS --- FIND TOTAL NUMBERS OF TOKEN SEQUENCES
    for ($i = $mem; $i < scalar @tokens; $i++) {
	$pre = "";
	for ($j = 0; $j < $mem; $j++) { $pre .= $tokens[$i + $j - $mem] . $sep };
	$tuples_raw{$pre}{$tokens[$i]}++;
	$tuples_cnt{$pre}++;
    }


    # SECOND PASS --- FIND PROBABILITIES OF TOKEN SEQUENCES
    @tuples_pre = keys(%tuples_raw);
    foreach $pre (@tuples_pre) { 
	$prob = 0;
        $denom = $tuples_cnt{$pre};
	foreach $word (keys(%{ $tuples_raw{$pre} })) {
	    $num = $tuples_raw{$pre}{$word};
	    $prob += $num / $denom;
	    push(@{ $tuples{$pre} }, {"prob" => $prob, "word" => $word});
	}
    }


    @ring = ();
    $pos = 0;
    while ($pos < $mem) {
	push(@ring, $par);
	$pos++;
    }

    # GENERATE THE NEW TEXT
    do {
	$r = rand();
	$pre = "";
	for ($i = 0; $i < $mem; $i++) { $pre .= $ring[($pos + $i - $mem) % ($mem + 1)] . $sep; }
	for ($i = 0; $i < scalar @{ $tuples{$pre} }; $i++) {
	    %candidate = %{ $tuples{$pre}[$i] };
	    if ($r <= $candidate{"prob"}) {
		$word = $candidate{"word"};
		last;
	    }
	}
	print ((($word =~ /$punc/) ? "" : " ") . $word);
	$ring[$pos % ($mem + 1)] = $word;
	$pos++;
    } while ($word ne $par);

    print "\n";
}
