Skip to content

Commit c5c8851

Browse files
committed
refactoring, to avoid creating/deleting zillions of UnicodeNormalizers
1 parent 35dc402 commit c5c8851

19 files changed

Lines changed: 81 additions & 68 deletions

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ LIBS="$ICU_LIBS $LIBS"
108108

109109
AX_LIB_READLINE
110110

111-
PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.36] )
111+
PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.38] )
112112

113113
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
114114
LIBS="$ticcutils_LIBS $LIBS"

include/frog/Parser.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class ParserBase {
8282
std::string MWU_tagset;
8383
std::string textclass;
8484
TiCC::UniFilter *filter;
85+
TiCC::UnicodeNormalizer _normalizer;
8586
std::string _host;
8687
std::string _port;
8788
};
@@ -107,6 +108,8 @@ class Parser: public ParserBase {
107108
std::vector<icu::UnicodeString> createRelInstances( const parseData& );
108109
std::vector<timbl_result> timbl_server( const std::string&,
109110
const std::vector<icu::UnicodeString>& );
111+
std::vector<timbl_result> timbl( Timbl::TimblAPI *,
112+
const std::vector<UnicodeString>& );
110113
Parser( const Parser& ) = delete; // inhibit copies
111114
Parser operator=( const Parser& ) = delete; // inhibit copies
112115
std::string maxDepSpanS;

include/frog/clex.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ namespace CLEX {
6262
};
6363
bool is_CELEX_base( const Type& );
6464
Type select_tag( const char ch );
65-
std::string toString( const Type& );
65+
icu::UnicodeString toString( const Type& );
6666
icu::UnicodeString toUnicodeString( const Type& );
6767
Type toCLEX( const icu::UnicodeString& );
6868
Type toCLEX( const UChar );

include/frog/mblem_mod.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class Mblem {
9494
TiCC::LogStream *errLog;
9595
TiCC::LogStream *dbgLog;
9696
TiCC::UniFilter *filter;
97+
mutable TiCC::UnicodeNormalizer _normalizer;
9798
Mblem( const Mblem& ) = delete;
9899
Mblem& operator=( const Mblem& ) = delete;
99100
};

include/frog/mbma_mod.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class Mbma {
114114
TiCC::LogStream *errLog;
115115
TiCC::LogStream *dbgLog;
116116
TiCC::UniFilter *filter;
117+
mutable TiCC::UnicodeNormalizer _normalizer;
117118
std::string _host;
118119
std::string _port;
119120
std::string _base;

include/frog/ner_tagger_mod.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class NERTagger: public BaseTagger {
8181
std::vector<std::map<icu::UnicodeString,std::set<std::string>>>& );
8282
std::vector<icu::UnicodeString> create_ner_list( const std::vector<icu::UnicodeString>&,
8383
const std::vector<std::map<icu::UnicodeString,std::set<std::string>>>& );
84+
std::vector<UnicodeString> serialize( const std::vector<std::set<std::string>>& ) const;
8485
std::vector<std::map<icu::UnicodeString,std::set<std::string>>> gazet_ners;
8586
std::vector<std::map<icu::UnicodeString,std::set<std::string>>> override_ners;
8687
void addEntity( frog_data&,

include/frog/tagger_base.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ class BaseTagger {
7070
const std::string& version() const { return _version; };
7171
private:
7272
std::vector<tag_entry> extract_sentence( const frog_data& );
73+
nlohmann::json create_json( const std::vector<tag_entry>& ) const;
74+
std::vector<Tagger::TagResult> json_to_TR( const nlohmann::json& in ) const;
7375
nlohmann::json read_from_client( Sockets::ClientSocket& ) const;
7476
void write_to_client( nlohmann::json&,
7577
Sockets::ClientSocket& ) const;
@@ -87,6 +89,7 @@ class BaseTagger {
8789
std::string _port;
8890
MbtAPI *tagger;
8991
TiCC::UniFilter *filter;
92+
mutable TiCC::UnicodeNormalizer _normalizer;
9093
std::vector<std::string> _words;
9194
std::vector<Tagger::TagResult> _tag_result;
9295
std::map<icu::UnicodeString,icu::UnicodeString> token_tag_map;

src/FrogData.cxx

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,11 @@ json frog_record::to_json() const {
6868
\return a JSON structure
6969
*/
7070
json result;
71-
result["word"] = TiCC::UnicodeToUTF8(word);
71+
TiCC::UnicodeNormalizer UN;
72+
result["word"] = TiCC::UnicodeToUTF8(word,UN);
7273
if ( !token_class.isEmpty() ){
7374
json tok;
74-
tok["token"] = TiCC::UnicodeToUTF8(token_class);
75+
tok["token"] = TiCC::UnicodeToUTF8(token_class,UN);
7576
if ( no_space ){
7677
tok["space"] = false;
7778
}
@@ -81,29 +82,29 @@ json frog_record::to_json() const {
8182
result["ucto"] = tok;
8283
}
8384
if ( !lemmas.empty() ){
84-
result["lemma"] = TiCC::UnicodeToUTF8(lemmas[0]);
85+
result["lemma"] = TiCC::UnicodeToUTF8(lemmas[0],UN);
8586
}
8687
if ( !morph_string.isEmpty() ){
87-
result["morph"] = TiCC::UnicodeToUTF8(morph_string);
88+
result["morph"] = TiCC::UnicodeToUTF8(morph_string,UN);
8889
}
8990
if ( compound_string.find("0") == string::npos ){
9091
result["compound"] = compound_string;
9192
}
9293
if ( !tag.isEmpty() ){
9394
json tg;
94-
tg["tag"] = TiCC::UnicodeToUTF8(tag);
95+
tg["tag"] = TiCC::UnicodeToUTF8(tag,UN);
9596
tg["confidence"] = tag_confidence;
9697
result["pos"] = tg;
9798
}
9899
if ( !ner_tag.isEmpty() && ner_confidence > 0.0 ){
99100
json tg;
100-
tg["tag"] = TiCC::UnicodeToUTF8(ner_tag);
101+
tg["tag"] = TiCC::UnicodeToUTF8(ner_tag,UN);
101102
tg["confidence"] = ner_confidence;
102103
result["ner"] = tg;
103104
}
104105
if ( !iob_tag.isEmpty() ){
105106
json tg;
106-
tg["tag"] = TiCC::UnicodeToUTF8(iob_tag);
107+
tg["tag"] = TiCC::UnicodeToUTF8(iob_tag,UN);
107108
tg["confidence"] = iob_confidence;
108109
result["chunking"] = tg;
109110
}

src/Parser.cxx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -984,8 +984,8 @@ parseData Parser::prepareParse( frog_data& fd ){ // |
984984
}
985985

986986

987-
vector<timbl_result> timbl( Timbl::TimblAPI* tim,
988-
const vector<UnicodeString>& instances ){
987+
vector<timbl_result> Parser::timbl( Timbl::TimblAPI* tim,
988+
const vector<UnicodeString>& instances ){
989989
/// call a Timbl experiment with a list of instances
990990
/*!
991991
\param tim The Timbl to use
@@ -997,7 +997,7 @@ vector<timbl_result> timbl( Timbl::TimblAPI* tim,
997997
for ( const auto& inst : instances ){
998998
const Timbl::ClassDistribution *db;
999999
const Timbl::TargetValue *tv = tim->Classify( inst, db );
1000-
result.push_back( timbl_result( TiCC::UnicodeToUTF8(tv->name()),
1000+
result.push_back( timbl_result( TiCC::UnicodeToUTF8(tv->name(),_normalizer),
10011001
db->Confidence(tv), *db ) );
10021002
}
10031003
return result;
@@ -1073,7 +1073,7 @@ vector<timbl_result> Parser::timbl_server( const string& base,
10731073
query["command"] = "classify";
10741074
json arr = json::array();
10751075
for ( const auto& inst : instances ){
1076-
arr.push_back( TiCC::UnicodeToUTF8(inst) );
1076+
arr.push_back( TiCC::UnicodeToUTF8(inst,_normalizer) );
10771077
}
10781078
query["params"] = arr;
10791079
DBG << "send json" << query.dump(2) << endl;

src/cgn_tagger_mod.cxx

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ void CGNTagger::add_tags( const vector<folia::Word*>& wv,
300300
for ( const auto& word : fd.units ){
301301
folia::KWargs u_args;
302302
u_args["set"] = getTagset();
303-
u_args["class"] = TiCC::UnicodeToUTF8(word.tag);
303+
u_args["class"] = TiCC::UnicodeToUTF8(word.tag,_normalizer);
304304
if ( textclass != "current" ){
305305
u_args["textclass"] = textclass;
306306
}
@@ -312,7 +312,7 @@ void CGNTagger::add_tags( const vector<folia::Word*>& wv,
312312
}
313313
vector<UnicodeString> hv = TiCC::split_at_first_of( word.tag, "()" );
314314
UnicodeString head = hv[0];
315-
u_args["class"] = TiCC::UnicodeToUTF8(head);
315+
u_args["class"] = TiCC::UnicodeToUTF8(head,_normalizer);
316316
#pragma omp critical (foliaupdate)
317317
{
318318
postag->add_child<folia::HeadFeature>( u_args );
@@ -325,10 +325,10 @@ void CGNTagger::add_tags( const vector<folia::Word*>& wv,
325325
for ( const auto& f : feats ){
326326
folia::KWargs f_args;
327327
f_args["set"] = getTagset();
328-
f_args["subset"] = getSubSet( TiCC::UnicodeToUTF8(f),
329-
TiCC::UnicodeToUTF8(head),
330-
TiCC::UnicodeToUTF8(word.tag) );
331-
f_args["class"] = TiCC::UnicodeToUTF8(f);
328+
f_args["subset"] = getSubSet( TiCC::UnicodeToUTF8(f,_normalizer),
329+
TiCC::UnicodeToUTF8(head,_normalizer),
330+
TiCC::UnicodeToUTF8(word.tag,_normalizer) );
331+
f_args["class"] = TiCC::UnicodeToUTF8(f,_normalizer);
332332
#pragma omp critical (foliaupdate)
333333
{
334334
postag->add_child<folia::Feature>( f_args );

0 commit comments

Comments
 (0)