Frog
ucto_tokenizer_mod.h
Go to the documentation of this file.
1 /*
2  Copyright (c) 2006 - 2020
3  CLST - Radboud University
4  ILK - Tilburg University
5 
6  This file is part of frog.
7 
8  frog is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 3 of the License, or
11  (at your option) any later version.
12 
13  frog is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  For questions and suggestions, see:
22  https://github.com/LanguageMachines/frog/issues
23  or send mail to:
24  lamasoftware (at) science.ru.nl
25 
26 */
27 
28 #ifndef UCTO_TOKENIZER_MOD_H
29 #define UCTO_TOKENIZER_MOD_H
30 
31 #include "libfolia/folia.h"
32 #include "ucto/tokenize.h"
33 #include "frog/FrogData.h"
34 #include "ticcutils/Configuration.h"
35 
36 class UctoTokenizer {
37  public:
38  explicit UctoTokenizer( TiCC::LogStream *, TiCC::LogStream * =0 );
39  ~UctoTokenizer();
40  bool init( const TiCC::Configuration& );
41  void setUttMarker( const std::string& );
42  void setPassThru( bool );
43  void set_TC_debug( bool );
44  bool getPassThru() const;
45  void setSentencePerLineInput( bool );
46  void setInputEncoding( const std::string& );
47  void setQuoteDetection( bool );
48  void setInputXml( bool );
49  void setFiltering( bool );
50  void setInputClass( const std::string& );
51  void setOutputClass( const std::string& );
52  void setDocID( const std::string& );
53  void setTextRedundancy( const std::string& );
54  void setWordCorrection( bool );
55  std::string get_data_version() const;
56  std::string default_language() const;
57  bool get_setting_info( const std::string&, std::string&, std::string& ) const;
58  std::vector<std::string> tokenize( const std::string& );
59  std::vector<Tokenizer::Token> tokenize_line( const std::string&, const std::string& = "" );
60  std::vector<Tokenizer::Token> tokenize_line_next();
61  std::vector<Tokenizer::Token> tokenize_stream( std::istream& );
62  std::vector<Tokenizer::Token> tokenize_stream_next();
63  std::string tokenizeStream( std::istream& );
64  std::vector<folia::Word*> add_words( folia::Sentence *,
65  const frog_data& ) const;
66  void add_provenance( folia::Document& , folia::processor * ) const;
67  std::vector<Tokenizer::Token> correct_words( folia::FoliaElement *,
68  const std::vector<folia::Word*>& );
69  private:
70  std::istream *cur_is;
71  Tokenizer::TokenizerClass *tokenizer;
72  TiCC::LogStream *errLog;
73  TiCC::LogStream *dbgLog;
74  int debug;
75  std::string textredundancy;
76 };
77 
78 #endif
UctoTokenizer::tokenize_line_next
std::vector< Tokenizer::Token > tokenize_line_next()
Definition: ucto_tokenizer_mod.cxx:512
UctoTokenizer::add_provenance
void add_provenance(folia::Document &, folia::processor *) const
Definition: ucto_tokenizer_mod.cxx:375
frog_data
a datastructure to hold all frogged information of one Sentence
Definition: FrogData.h:76
UctoTokenizer::add_words
std::vector< folia::Word * > add_words(folia::Sentence *, const frog_data &) const
Definition: ucto_tokenizer_mod.cxx:584
UctoTokenizer::setTextRedundancy
void setTextRedundancy(const std::string &)
Definition: ucto_tokenizer_mod.cxx:325
UctoTokenizer::setInputXml
void setInputXml(bool)
Definition: ucto_tokenizer_mod.cxx:299
UctoTokenizer::setPassThru
void setPassThru(bool)
Definition: ucto_tokenizer_mod.cxx:352
UctoTokenizer
Definition: ucto_tokenizer_mod.h:36
UctoTokenizer::default_language
std::string default_language() const
Definition: ucto_tokenizer_mod.cxx:574
UctoTokenizer::setFiltering
void setFiltering(bool)
Definition: ucto_tokenizer_mod.cxx:312
UctoTokenizer::setWordCorrection
void setWordCorrection(bool)
Definition: ucto_tokenizer_mod.cxx:226
UctoTokenizer::~UctoTokenizer
~UctoTokenizer()
Definition: ucto_tokenizer_mod.cxx:71
UctoTokenizer::correct_words
std::vector< Tokenizer::Token > correct_words(folia::FoliaElement *, const std::vector< folia::Word * > &)
Definition: ucto_tokenizer_mod.cxx:658
UctoTokenizer::setSentencePerLineInput
void setSentencePerLineInput(bool)
Definition: ucto_tokenizer_mod.cxx:200
UctoTokenizer::set_TC_debug
void set_TC_debug(bool)
Definition: ucto_tokenizer_mod.cxx:339
UctoTokenizer::tokenize_stream
std::vector< Tokenizer::Token > tokenize_stream(std::istream &)
Definition: ucto_tokenizer_mod.cxx:469
UctoTokenizer::tokenizeStream
std::string tokenizeStream(std::istream &)
Definition: ucto_tokenizer_mod.cxx:427
UctoTokenizer::setDocID
void setDocID(const std::string &)
Definition: ucto_tokenizer_mod.cxx:284
UctoTokenizer::init
bool init(const TiCC::Configuration &)
Definition: ucto_tokenizer_mod.cxx:97
UctoTokenizer::setQuoteDetection
void setQuoteDetection(bool)
Definition: ucto_tokenizer_mod.cxx:213
UctoTokenizer::setUttMarker
void setUttMarker(const std::string &)
Definition: ucto_tokenizer_mod.cxx:185
UctoTokenizer::setInputClass
void setInputClass(const std::string &)
Definition: ucto_tokenizer_mod.cxx:254
UctoTokenizer::UctoTokenizer
UctoTokenizer(TiCC::LogStream *, TiCC::LogStream *=0)
Definition: ucto_tokenizer_mod.cxx:50
UctoTokenizer::tokenize_line
std::vector< Tokenizer::Token > tokenize_line(const std::string &, const std::string &="")
Definition: ucto_tokenizer_mod.cxx:489
UctoTokenizer::getPassThru
bool getPassThru() const
Definition: ucto_tokenizer_mod.cxx:365
UctoTokenizer::get_data_version
std::string get_data_version() const
Definition: ucto_tokenizer_mod.cxx:527
UctoTokenizer::tokenize_stream_next
std::vector< Tokenizer::Token > tokenize_stream_next()
Definition: ucto_tokenizer_mod.cxx:450
UctoTokenizer::tokenize
std::vector< std::string > tokenize(const std::string &)
Definition: ucto_tokenizer_mod.cxx:405
FrogData.h
UctoTokenizer::setOutputClass
void setOutputClass(const std::string &)
Definition: ucto_tokenizer_mod.cxx:269
UctoTokenizer::get_setting_info
bool get_setting_info(const std::string &, std::string &, std::string &) const
Definition: ucto_tokenizer_mod.cxx:537
UctoTokenizer::setInputEncoding
void setInputEncoding(const std::string &)
Definition: ucto_tokenizer_mod.cxx:239