Go to the documentation of this file.
28 #ifndef UCTO_TOKENIZER_MOD_H
29 #define UCTO_TOKENIZER_MOD_H
31 #include "libfolia/folia.h"
32 #include "ucto/tokenize.h"
34 #include "ticcutils/Configuration.h"
38 explicit UctoTokenizer( TiCC::LogStream *, TiCC::LogStream * =0 );
40 bool init(
const TiCC::Configuration& );
57 bool get_setting_info(
const std::string&, std::string&, std::string& )
const;
58 std::vector<std::string>
tokenize(
const std::string& );
59 std::vector<Tokenizer::Token>
tokenize_line(
const std::string&,
const std::string& =
"" );
64 std::vector<folia::Word*>
add_words( folia::Sentence *,
66 void add_provenance( folia::Document& , folia::processor * )
const;
67 std::vector<Tokenizer::Token>
correct_words( folia::FoliaElement *,
68 const std::vector<folia::Word*>& );
71 Tokenizer::TokenizerClass *tokenizer;
72 TiCC::LogStream *errLog;
73 TiCC::LogStream *dbgLog;
75 std::string textredundancy;
std::vector< Tokenizer::Token > tokenize_line_next()
Definition: ucto_tokenizer_mod.cxx:512
void add_provenance(folia::Document &, folia::processor *) const
Definition: ucto_tokenizer_mod.cxx:375
frog_data: a data structure that holds all frogged information of one Sentence
Definition: FrogData.h:76
std::vector< folia::Word * > add_words(folia::Sentence *, const frog_data &) const
Definition: ucto_tokenizer_mod.cxx:584
void setTextRedundancy(const std::string &)
Definition: ucto_tokenizer_mod.cxx:325
void setInputXml(bool)
Definition: ucto_tokenizer_mod.cxx:299
void setPassThru(bool)
Definition: ucto_tokenizer_mod.cxx:352
UctoTokenizer
Definition: ucto_tokenizer_mod.h:36
std::string default_language() const
Definition: ucto_tokenizer_mod.cxx:574
void setFiltering(bool)
Definition: ucto_tokenizer_mod.cxx:312
void setWordCorrection(bool)
Definition: ucto_tokenizer_mod.cxx:226
~UctoTokenizer()
Definition: ucto_tokenizer_mod.cxx:71
std::vector< Tokenizer::Token > correct_words(folia::FoliaElement *, const std::vector< folia::Word * > &)
Definition: ucto_tokenizer_mod.cxx:658
void setSentencePerLineInput(bool)
Definition: ucto_tokenizer_mod.cxx:200
void set_TC_debug(bool)
Definition: ucto_tokenizer_mod.cxx:339
std::vector< Tokenizer::Token > tokenize_stream(std::istream &)
Definition: ucto_tokenizer_mod.cxx:469
std::string tokenizeStream(std::istream &)
Definition: ucto_tokenizer_mod.cxx:427
void setDocID(const std::string &)
Definition: ucto_tokenizer_mod.cxx:284
bool init(const TiCC::Configuration &)
Definition: ucto_tokenizer_mod.cxx:97
void setQuoteDetection(bool)
Definition: ucto_tokenizer_mod.cxx:213
void setUttMarker(const std::string &)
Definition: ucto_tokenizer_mod.cxx:185
void setInputClass(const std::string &)
Definition: ucto_tokenizer_mod.cxx:254
UctoTokenizer(TiCC::LogStream *, TiCC::LogStream *=0)
Definition: ucto_tokenizer_mod.cxx:50
std::vector< Tokenizer::Token > tokenize_line(const std::string &, const std::string &="")
Definition: ucto_tokenizer_mod.cxx:489
bool getPassThru() const
Definition: ucto_tokenizer_mod.cxx:365
std::string get_data_version() const
Definition: ucto_tokenizer_mod.cxx:527
std::vector< Tokenizer::Token > tokenize_stream_next()
Definition: ucto_tokenizer_mod.cxx:450
std::vector< std::string > tokenize(const std::string &)
Definition: ucto_tokenizer_mod.cxx:405
void setOutputClass(const std::string &)
Definition: ucto_tokenizer_mod.cxx:269
bool get_setting_info(const std::string &, std::string &, std::string &) const
Definition: ucto_tokenizer_mod.cxx:537
void setInputEncoding(const std::string &)
Definition: ucto_tokenizer_mod.cxx:239