#include <time.h>
#include "UnigramTextClassifier.h"
using namespace std;
using namespace TextClassifier;
// Default constructor.
// Creates an untrained classifier: both character counters start at zero and
// the classification label is set to the placeholder "Unknown".
UnigramTextClassifier::UnigramTextClassifier()
{
    this->my_classification = "Unknown";
    this->my_corpus_total = 0;   // characters seen while learning / reading a model
    this->my_total = 0;          // characters seen while measuring a text
}
// Constructs an untrained classifier carrying the given label.
//   classification - label stored as this classifier's classification.
// Both character counters start at zero.
UnigramTextClassifier::UnigramTextClassifier(const string classification)
{
    this->my_classification = classification;
    this->my_corpus_total = 0;   // characters seen while learning / reading a model
    this->my_total = 0;          // characters seen while measuring a text
}
// Trains the classifier on every character readable from `in`.
// Each character read increments its entry in the frequency table and the
// corpus character count. Reading stops when in.get() fails (normally at end
// of stream). Counts are added to whatever the table already holds.
void UnigramTextClassifier::learn(istream& in)
{
    char ch;
    while (in.get(ch))
    {
        // Key by the unsigned byte value, exactly as bits_required(istream&)
        // and read() do, so bytes >= 0x80 are always stored and looked up
        // under the same key regardless of whether plain char is signed.
        my_freqs[static_cast<unsigned char>(ch)]++;
        my_corpus_total++;
    }
}
void UnigramTextClassifier::learn(char* fname)
{
ifstream instr(fname);
learn(instr);
instr.close();
}
// Writes the model to `out` as tab-separated text:
//   line 1: a "//" comment with the corpus size and the creation time
//   line 2: "Classification<TAB><label>"
//   line 3: "Total<TAB><corpus size>"
//   then one line per table entry: "<code><TAB><count><TAB><info value>"
// where <code> is the character printed as an unsigned integer and
// <info value> is info_value(count / corpus size).
void UnigramTextClassifier::dump(ostream& out)
{
    frequency_map table = this->freqs();
    long corpus_size = this->corpus_total();

    // Header lines.
    out << "// UnigramTextClassifier created this file from " << corpus_size
        << " characters. It was created on " + ctime_string() << ".\n";
    out << "Classification\t" << this->classification() << endl;
    out << "Total\t" << corpus_size << endl;

    // One row per character in the table.
    frequency_map::iterator entry = table.begin();
    while (entry != table.end())
    {
        out << static_cast<unsigned int>(entry->first) << "\t" << entry->second << "\t"
            << info_value(static_cast<float>(entry->second) / corpus_size) << endl;
        ++entry;
    }
}
void UnigramTextClassifier::dump(char* fname)
{
ofstream instr(fname);
dump(instr);
instr.close();
}
// Loads a model in the text layout written by dump(ostream&).
//   line 1: comment header                     - discarded
//   line 2: "Classification<TAB><label>"       - label becomes the classification
//   line 3: "Total<TAB><n>"                    - discarded; the total is rebuilt
//                                                from the per-character rows
//   rows:   "<code><TAB><count><TAB><rest>"    - count stored for character <code>
// Reading stops at the first row whose first field is empty, which is what
// getline leaves in the buffer once the stream is exhausted or has failed.
// Fields are read into a 256-byte buffer, so longer fields are not supported.
void UnigramTextClassifier::read(istream& in)
{
    char text[256];

    in.getline(text, 256, '\n');            // line 1: comment header
    in.getline(text, 256, '\t');            // line 2: the "Classification" tag
    in.getline(text, 256, '\n');            // line 2: the label itself
    string classification(text);
    this->setClassification(classification);
    in.getline(text, 256);                  // line 3: "Total" line, ignored

    while (true)
    {
        in.getline(text, 256, '\t');        // character code
        if (text[0] == 0) break;
        const unsigned char ch = static_cast<unsigned char>(atoi(text));

        in.getline(text, 256, '\t');        // count
        const long frequency = atol(text);

        // The row REPLACES any count already held for this character, so only
        // the difference is added to the corpus total. Adding the full count
        // would leave the total larger than the sum of the table whenever the
        // character was already present (model read on top of learned data,
        // or a code repeated in the file).
        my_corpus_total += frequency - my_freqs[ch];
        my_freqs[ch] = frequency;

        in.getline(text, 256);              // rest of the row (info value), ignored
    }
}
void UnigramTextClassifier::read(char* fname)
{
ifstream instr(fname);
read(instr);
instr.close();
}
// Scores the text readable from `in` against this model.
// The score is 1 - (bits required by the model) / (8 * total()), with any
// result that is not >= 0 reported as 0.
// NOTE(review): total() is presumably the character count maintained by
// bits_required(istream&) - confirm against the header.
float UnigramTextClassifier::score (istream& in)
{
    // bits_required() must run first: it consumes the stream and updates the
    // character count that total() is read from afterwards.
    const float model_bits = bits_required(in);
    const long raw_bits = total()*8;
    const float saving = 1 - model_bits/raw_bits;

    // Written as ">= 0" on purpose: a NaN (0/0 for an empty stream) fails the
    // comparison and falls through to 0.
    if (saving >= 0)
    {
        return saving;
    }
    return 0.0;
}
float UnigramTextClassifier::score(char* fname)
{
ifstream instr(fname);
float sc = score(instr);
instr.close();
return sc;
}
float UnigramTextClassifier::bits_required(unsigned char ch)
{
frequency_map m = this->freqs();
long tot = this->corpus_total();
long frequency = ((!m[ch]) ? 1 : m[ch]) ;
return info_value((float)frequency/tot);
}
// Returns the number of bits this model needs to encode the text readable
// from `in`: the sum over distinct characters of
// bits_required(character) * occurrences.
// Side effect: my_total is set to the number of characters read from `in`.
float UnigramTextClassifier::bits_required(istream& in)
{
    // Start the per-text character count afresh. Without this, a second call
    // on the same object kept adding to the previous text's count, and
    // score() then divided the new text's bits by the cumulative size.
    my_total = 0;

    // Tally the text's own character frequencies.
    frequency_map tfreq;
    char ch;
    while (in.get(ch))
    {
        tfreq[static_cast<unsigned char>(ch)]++;
        my_total++;
    }

    // Weight each distinct character's cost by how often it occurred.
    float bits = 0;
    for (frequency_map::iterator it = tfreq.begin(); it != tfreq.end(); ++it)
    {
        bits += bits_required(it->first) * it->second;
    }
    return bits;
}
// Returns the bits required to encode the contents of the file named
// `fname`; see bits_required(istream&).
// The file is opened with a default-mode ifstream; no check is made that the
// open succeeded.
float UnigramTextClassifier::bits_required(char* fname)
{
    ifstream instr(fname);
    // Capture the result before closing: the previous code returned first,
    // which left the close() call unreachable.
    const float bits = bits_required(instr);
    instr.close();
    return bits;
}
// Base-2 logarithm of n, computed by change of base: log(n) / log(2).
float UnigramTextClassifier::lg(float n)
{
    // Natural log of 2, rounded to float once and used as the divisor.
    const float ln_of_two = log(2.0);

    return log(n) / ln_of_two;
}
// Information value of n: the negated base-2 logarithm, -lg(n).
float UnigramTextClassifier::info_value(float n)
{
    const float log2_of_n = lg(n);
    return -log2_of_n;
}
// Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS".
// Returns an empty string if the time cannot be converted or formatted.
string UnigramTextClassifier::ctime_string()
{
    time_t rawtime;
    time(&rawtime);

    // localtime() may return a null pointer on failure; handing that to
    // strftime() would be undefined behaviour.
    struct tm* timeinfo = localtime(&rawtime);
    if (!timeinfo)
    {
        return string();
    }

    char tmpbuf[128];
    // strftime() returns 0 when the result does not fit, in which case the
    // buffer contents are indeterminate and must not be used.
    if (strftime(tmpbuf, sizeof tmpbuf, "%Y-%m-%d %H:%M:%S", timeinfo) == 0)
    {
        return string();
    }
    return string(tmpbuf);
}