#include <ucto_tokenizer_mod.h>
◆ UctoTokenizer()
| UctoTokenizer::UctoTokenizer |
( |
TiCC::LogStream * |
err_log, |
|
|
TiCC::LogStream * |
dbg_log = 0 |
|
) |
| |
|
explicit |
Create a (yet UNINITIALIZED) Tokenizer
- Parameters
-
| err_log | A LogStream for error messages |
| dbg_log | A LogStream for debugging |
UctoTokenizer::init() needs to be called to get really going
◆ ~UctoTokenizer()
| UctoTokenizer::~UctoTokenizer |
( |
| ) |
|
◆ add_provenance()
| void UctoTokenizer::add_provenance |
( |
folia::Document & |
doc, |
|
|
folia::processor * |
main |
|
) |
| const |
add provenance information for the tokenizer. (FoLiA output only)
- Parameters
-
| doc | the FoLiA document to add to |
| main | the processor to use (presumably the Frog processor) |
◆ add_words()
| vector< folia::Word * > UctoTokenizer::add_words |
( |
folia::Sentence * |
s, |
|
|
const frog_data & |
fd |
|
) |
| const |
create a list of folia::Word elements under a folia::Sentence
- Parameters
-
| s | The parent to attach to |
| fd | The frog_data structure with the needed information |
- Returns
- a list of newly created folia::Word elements
this function should be used when creating FoLiA output, and it assumes that the tokenizer has already filled in all required fields in the frog_data structure
◆ correct_words()
| vector< Tokenizer::Token > UctoTokenizer::correct_words |
( |
folia::FoliaElement * |
elt, |
|
|
const std::vector< folia::Word * > & |
wv |
|
) |
| |
correct Word elements in the FoLiA based on results found by the tokenizer
- Parameters
-
| elt | the FoliaElement which is the parent for correction |
| wv | The input Word vector, (of which elt is the parent) |
the input Word vector might represent a 'word' like "gisteren?". The tokenizer will split this into "gisteren" and "?" and this function will handle this by creating a correction with 2 words as <new>
◆ default_language()
| string UctoTokenizer::default_language |
( |
| ) |
const |
return the default language of the tokenizer
◆ get_data_version()
| string UctoTokenizer::get_data_version |
( |
| ) |
const |
returns the version of the uctodata files we use
◆ get_setting_info()
| bool UctoTokenizer::get_setting_info |
( |
const std::string & |
lang, |
|
|
std::string & |
name, |
|
|
std::string & |
version |
|
) |
| const |
get information about the current settings for a language
- Parameters
-
| lang | The language to examine |
| name | The name of the settings file used for lang |
| version | The version of the settingsfile |
◆ getPassThru()
| bool UctoTokenizer::getPassThru |
( |
| ) |
const |
get the value of the PassThru setting
◆ init()
| bool UctoTokenizer::init |
( |
const TiCC::Configuration & |
config | ) |
|
initialize a Tokenizer using a Configuration structure
- Parameters
-
| config | the Configuration to use |
- Returns
- true on success, false otherwise
this function sets up a Ucto tokenizer with some defaults and the values from config.
◆ set_TC_debug()
| void UctoTokenizer::set_TC_debug |
( |
bool |
b | ) |
|
set the tokenizer TC_debug property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setDocID()
| void UctoTokenizer::setDocID |
( |
const std::string & |
id | ) |
|
set the tokenizer DocID value (for FoLiA)
- Parameters
-
| id | a string holding the document id. e.g. "document_1" |
◆ setFiltering()
| void UctoTokenizer::setFiltering |
( |
bool |
b | ) |
|
set the tokenizer Filtering property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setInputClass()
| void UctoTokenizer::setInputClass |
( |
const std::string & |
cls | ) |
|
set the tokenizer InputClass value
- Parameters
-
| cls | a string holding the inputclass. e.g. "OCR" |
◆ setInputEncoding()
| void UctoTokenizer::setInputEncoding |
( |
const std::string & |
enc | ) |
|
set the tokenizer InputEncoding value
- Parameters
-
| enc | a string holding a possible encoding. e.g. "WINDOWS-1252" |
◆ setInputXml()
| void UctoTokenizer::setInputXml |
( |
bool |
b | ) |
|
set the tokenizer InputXml property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setOutputClass()
| void UctoTokenizer::setOutputClass |
( |
const std::string & |
cls | ) |
|
set the tokenizer OutputClass value
- Parameters
-
| cls | a string holding the outputclass. e.g. "current" |
◆ setPassThru()
| void UctoTokenizer::setPassThru |
( |
bool |
b | ) |
|
set the tokenizer PassThru property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setQuoteDetection()
| void UctoTokenizer::setQuoteDetection |
( |
bool |
b | ) |
|
set the tokenizer QuoteDetection property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setSentencePerLineInput()
| void UctoTokenizer::setSentencePerLineInput |
( |
bool |
b | ) |
|
set the tokenizer SentencePerLine property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ setTextRedundancy()
| void UctoTokenizer::setTextRedundancy |
( |
const std::string & |
tr | ) |
|
set the tokenizer TextRedundancy value (for FoLiA)
- Parameters
-
| tr | a string holding the value. Possible values are "none", "minimal" and "full" |
◆ setUttMarker()
| void UctoTokenizer::setUttMarker |
( |
const std::string & |
u | ) |
|
set the utterance marker for the tokenizer
- Parameters
-
| u | string holding the marker. e.g. "<utt>" |
◆ setWordCorrection()
| void UctoTokenizer::setWordCorrection |
( |
bool |
b | ) |
|
set the tokenizer WordCorrection property
- Parameters
-
| b | a boolean: true to set the property ON, false to set it OFF |
◆ tokenize()
| vector< string > UctoTokenizer::tokenize |
( |
const std::string & |
line | ) |
|
Tokenize a buffer of characters into a list of tokenized sentences
- Parameters
-
| line | a sequence of characters to be tokenized |
- Returns
- a vector of strings each representing a sentence
The input line may be long and include newlines etc. It is assumed to be in the current InputEncoding.
The output is a sequence of tokenized strings in UTF8, each representing one sentence.
◆ tokenize_line()
| vector< Tokenizer::Token > UctoTokenizer::tokenize_line |
( |
const std::string & |
buffer, |
|
|
const std::string & |
lang = "" |
|
) |
| |
tokenize a buffer using a specific language
- Parameters
-
| buffer | a (possibly long) sequence of characters |
| lang | the language to use for tokenizing |
- Returns
- a list of Ucto::Token elements representing the first sentence
The buffer is consumed completely and stored as tokens in the Ucto Tokenizer
After calling tokenize_line() you should continue by calling tokenize_line_next() repeatedly to extract the next sentences
◆ tokenize_line_next()
| vector< Tokenizer::Token > UctoTokenizer::tokenize_line_next |
( |
| ) |
|
extract the next sequence of Token elements
- Returns
- a list of Ucto::Token elements representing the next sentence
assumes the tokenizer is first set up using tokenize_line()
◆ tokenize_stream()
| vector< Tokenizer::Token > UctoTokenizer::tokenize_stream |
( |
std::istream & |
is | ) |
|
restart the tokenizer on stream 'is' and call tokenize_stream_next() for the first results
- Parameters
-
| is | the stream to connect to |
- Returns
- a list of Ucto::Token elements which can be examined further
After calling tokenize_stream() you should continue by calling tokenize_stream_next() until no more tokens are found.
◆ tokenize_stream_next()
| vector< Tokenizer::Token > UctoTokenizer::tokenize_stream_next |
( |
| ) |
|
Tokenize characters from the current input stream into a list of Ucto::Token
- Returns
- a list of Ucto::Token elements which can be examined further
This function will extract characters from stream and tokenize them.
This is non-greedy, so it might have to be called multiple times to consume the whole stream. It will return tokens up to an ENDOFSENTENCE token or until it runs out of data
◆ tokenizeStream()
| string UctoTokenizer::tokenizeStream |
( |
std::istream & |
is | ) |
|
Tokenize characters from a stream into one tokenized sentence
- Parameters
-
| is | the stream to extract characters from |
- Returns
- a string representing a sentence, or "" when done.
This function will extract characters from stream and tokenize them into a sentence. Can be called repeatedly to get more sentences.
The Ucto tokenizer is keeping state of the input, so when calling this function again it is possible that NO actual data is read from the stream while a sentence is still in the tokenizer's buffer
The documentation for this class was generated from the following files: