#include <ucto_tokenizer_mod.h>

Public Member Functions
	UctoTokenizer (TiCC::LogStream *, TiCC::LogStream *=0)

	~UctoTokenizer ()

bool	init (const TiCC::Configuration &)

void	setUttMarker (const std::string &)

void	setPassThru (bool)

void	set_TC_debug (bool)

bool	getPassThru () const

void	setSentencePerLineInput (bool)

void	setInputEncoding (const std::string &)

void	setQuoteDetection (bool)

void	setInputXml (bool)

void	setFiltering (bool)

void	setInputClass (const std::string &)

void	setOutputClass (const std::string &)

void	setDocID (const std::string &)

void	setTextRedundancy (const std::string &)

void	setWordCorrection (bool)

std::string	get_data_version () const

std::string	default_language () const

bool	get_setting_info (const std::string &, std::string &, std::string &) const

std::vector< std::string >	tokenize (const std::string &)

std::vector< Tokenizer::Token >	tokenize_line (const std::string &, const std::string &="")

std::vector< Tokenizer::Token >	tokenize_line_next ()

std::vector< Tokenizer::Token >	tokenize_stream (std::istream &)

std::vector< Tokenizer::Token >	tokenize_stream_next ()

std::string	tokenizeStream (std::istream &)

std::vector< folia::Word * >	add_words (folia::Sentence *, const frog_data &) const

void	add_provenance (folia::Document &, folia::processor *) const

std::vector< Tokenizer::Token >	correct_words (folia::FoliaElement *, const std::vector< folia::Word * > &)

Constructor & Destructor Documentation

◆ UctoTokenizer()

UctoTokenizer::UctoTokenizer	(	TiCC::LogStream *	err_log,
		TiCC::LogStream *	dbg_log = `0`
	)

explicit

Create a (yet UNINITIALIZED) Tokenizer

Parameters

err_log	A LogStream for error messages
dbg_log	A LogStream for debugging

UctoTokenizer::init() must be called before the Tokenizer can be used

◆ ~UctoTokenizer()

UctoTokenizer::~UctoTokenizer ( )

Destroy the Tokenizer

Member Function Documentation

◆ add_provenance()

void UctoTokenizer::add_provenance	(	folia::Document &	doc,
		folia::processor *	main
	)		const

add provenance information for the tokenizer. (FoLiA output only)

Parameters

doc	the FoLiA document to add to
main	the processor to use (presumably the Frog processor)

◆ add_words()

vector< folia::Word * > UctoTokenizer::add_words	(	folia::Sentence *	s,
		const frog_data &	fd
	)		const

create a list of folia::Word elements under a folia::Sentence

Parameters

s	The parent to attach to
fd	The frog_data structure with the needed information

Returns: a list of newly created folia::Word elements

This function should be used when creating FoLiA output, and it assumes that the tokenizer already filled in all required fields in the frog_data structure.

◆ correct_words()

vector< Tokenizer::Token > UctoTokenizer::correct_words	(	folia::FoliaElement *	elt,
		const std::vector< folia::Word * > &	wv
	)

correct Word elements in the FoLiA based on results found by the tokenizer

Parameters

elt	the FoliaElement which is the parent for correction
wv	The input Word vector, (of which elt is the parent)

the input Word vector might represent a 'word' like "gisteren?". The tokenizer will split this into "gisteren" and "?" and this function will handle this by creating a correction with 2 words as <new>

◆ default_language()

string UctoTokenizer::default_language ( ) const

return the default language of the tokenizer

◆ get_data_version()

string UctoTokenizer::get_data_version ( ) const

returns the version of the uctodata files we use

◆ get_setting_info()

bool UctoTokenizer::get_setting_info	(	const std::string &	lang,
		std::string &	name,
		std::string &	version
	)		const

get information about the current settings for a language

Parameters

lang	The language to examine
name	The name of the settings file used for lang
version	The version of the settingsfile

◆ getPassThru()

bool UctoTokenizer::getPassThru ( ) const

get the value of the PassThru setting

◆ init()

bool UctoTokenizer::init ( const TiCC::Configuration & config )

initialize a Tokenizer using a Configuration structure

Parameters

config the Configuration to use

Returns: true on success, false otherwise

This function sets up a Ucto tokenizer with some defaults and the values from config.

◆ set_TC_debug()

void UctoTokenizer::set_TC_debug ( bool b )

set the tokenizer TC_debug property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setDocID()

void UctoTokenizer::setDocID ( const std::string & id )

set the tokenizer DocID value (for FoLiA)

Parameters

id	a string holding the document id. e.g. "document_1"

◆ setFiltering()

void UctoTokenizer::setFiltering ( bool b )

set the tokenizer Filtering property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setInputClass()

void UctoTokenizer::setInputClass ( const std::string & cls )

set the tokenizer InputClass value

Parameters

cls	a string holding the inputclass. e.g. "OCR"

◆ setInputEncoding()

void UctoTokenizer::setInputEncoding ( const std::string & enc )

set the tokenizer InputEncoding value

Parameters

enc	a string holding a possible encoding. e.g. "WINDOWS-1252"

◆ setInputXml()

void UctoTokenizer::setInputXml ( bool b )

set the tokenizer InputXml property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setOutputClass()

void UctoTokenizer::setOutputClass ( const std::string & cls )

set the tokenizer OutputClass value

Parameters

cls	a string holding the outputclass. e.g. "current"

◆ setPassThru()

void UctoTokenizer::setPassThru ( bool b )

set the tokenizer PassThru property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setQuoteDetection()

void UctoTokenizer::setQuoteDetection ( bool b )

set the tokenizer QuoteDetection property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setSentencePerLineInput()

void UctoTokenizer::setSentencePerLineInput ( bool b )

set the tokenizer SentencePerLine property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ setTextRedundancy()

void UctoTokenizer::setTextRedundancy ( const std::string & tr )

set the tokenizer TextRedundancy value (for FoLiA)

Parameters

tr	a string holding the value. Possible values are "none", "minimal" and "full"

◆ setUttMarker()

void UctoTokenizer::setUttMarker ( const std::string & u )

set the utterance marker for the tokenizer

Parameters

u	string holding the marker. e.g. "<utt>"

◆ setWordCorrection()

void UctoTokenizer::setWordCorrection ( bool b )

set the tokenizer WordCorrection property

Parameters

b	a boolean: true sets the property ON, false sets it OFF

◆ tokenize()

vector< string > UctoTokenizer::tokenize ( const std::string & line )

Tokenize a buffer of characters into a list of tokenized sentences

Parameters

line	a sequence of characters to be tokenized

Returns: a vector of strings each representing a sentence

The input line may be long and include newlines etc. It is assumed to be in the current InputEncoding.

The output is a sequence of tokenized strings in UTF8, each representing one sentence.

◆ tokenize_line()

vector< Tokenizer::Token > UctoTokenizer::tokenize_line	(	const std::string &	buffer,
		const std::string &	lang = `""`
	)

tokenize a buffer using a specific language

Parameters

buffer	a (possibly long) sequence of characters
lang	the language to use for tokenizing

Returns: a list of Ucto::Token elements representing the first sentence

The buffer is consumed completely and stored as tokens in the Ucto Tokenizer

After calling tokenize_line() you should continue by calling tokenize_line_next() repeatedly to extract the next sentences

◆ tokenize_line_next()

vector< Tokenizer::Token > UctoTokenizer::tokenize_line_next ( )

extract the next sequence of Token elements

Returns: a list of Ucto::Token elements representing the next sentence

assumes the tokenizer is first set up using tokenize_line()

◆ tokenize_stream()

vector< Tokenizer::Token > UctoTokenizer::tokenize_stream ( std::istream & is )

restarts the tokenizer on stream 'is' and calls tokenize_stream_next() for the first results

Parameters

is	the stream to connect to

Returns: a list of Ucto::Token elements which can be examined further

After calling tokenize_stream() you should continue by calling tokenize_stream_next() until no more tokens are found.

◆ tokenize_stream_next()

vector< Tokenizer::Token > UctoTokenizer::tokenize_stream_next ( )

Tokenize characters from the current input stream into a list of Ucto::Token

Returns: a list of Ucto::Token elements which can be examined further

This function will extract characters from stream and tokenize them.

This is non-greedy. It might be called multiple times to consume the whole stream. It will return tokens up to an ENDOFSENTENCE token or until it runs out of data.

◆ tokenizeStream()

string UctoTokenizer::tokenizeStream ( std::istream & is )

Tokenize characters from a stream into one tokenized sentence

Parameters

is	the input stream

Returns: a string representing a sentence, or "" when done.

This function will extract characters from stream and tokenize them into a sentence. Can be called repeatedly to get more sentences.

The Ucto tokenizer is keeping state of the input, so when calling this function again it is possible that NO actual data is read from the stream while a sentence is still in the tokenizer's buffer

The documentation for this class was generated from the following files:

include/frog/ucto_tokenizer_mod.h
src/ucto_tokenizer_mod.cxx

Public Member Functions

Constructor & Destructor Documentation

◆ UctoTokenizer()

◆ ~UctoTokenizer()

Member Function Documentation

◆ add_provenance()

◆ add_words()

◆ correct_words()

◆ default_language()

◆ get_data_version()

◆ get_setting_info()

◆ getPassThru()

◆ init()

◆ set_TC_debug()

◆ setDocID()

◆ setFiltering()

◆ setInputClass()

◆ setInputEncoding()

◆ setInputXml()

◆ setOutputClass()

◆ setPassThru()

◆ setQuoteDetection()

◆ setSentencePerLineInput()

◆ setTextRedundancy()

◆ setUttMarker()

◆ setWordCorrection()

◆ tokenize()

◆ tokenize_line()

◆ tokenize_line_next()

◆ tokenize_stream()

◆ tokenize_stream_next()

◆ tokenizeStream()