Skip to content

Commit 2fa6cb2

Browse files
committed
safe work
1 parent c5c8851 commit 2fa6cb2

9 files changed

Lines changed: 79 additions & 55 deletions

File tree

include/frog/csidp.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ class timbl_result {
4949
public:
5050
timbl_result( const std::string&,
5151
double,
52-
const Timbl::ClassDistribution& );
52+
const Timbl::ClassDistribution&,
53+
TiCC::UnicodeNormalizer& );
5354
timbl_result( const std::string&,
5455
double,
5556
const std::vector<std::pair<std::string,double>>& );

include/frog/mbma_brackets.h

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@
3434

3535
#include <vector>
3636
#include <list>
37-
#include "unicode/unistr.h"
3837
#include "ticcutils/LogStream.h"
38+
#include "ticcutils/Unicode.h"
3939
#include "frog/clex.h"
4040

4141
/// The state of the MBMA structure
@@ -75,20 +75,23 @@ class RulePart;
7575
/// \brief a base class for storing bracketed MBMA rules
7676
class BaseBracket {
7777
public:
78-
BaseBracket( CLEX::Type t, const std::vector<CLEX::Type>& R, int flag,
79-
TiCC::LogStream& l ):
80-
RightHand(R),
78+
BaseBracket( CLEX::Type t, const std::vector<CLEX::Type>& R, int flag,
79+
TiCC::LogStream& l, TiCC::UnicodeNormalizer& norm ):
80+
RightHand(R),
8181
cls(t),
8282
_status( FAILED ),
8383
debugFlag(flag),
84-
myLog(l)
85-
{};
86-
BaseBracket( CLEX::Type t, int flag, TiCC::LogStream& l ):
87-
cls(t),
84+
myLog(l),
85+
_normalizer(norm)
86+
{};
87+
BaseBracket( CLEX::Type t, int flag,
88+
TiCC::LogStream& l, TiCC::UnicodeNormalizer & norm ):
89+
cls(t),
8890
_status( FAILED ),
8991
debugFlag(flag),
90-
myLog(l)
91-
{};
92+
myLog(l),
93+
_normalizer(norm)
94+
{};
9295
virtual ~BaseBracket() {};
9396
Status status() const { return _status; };
9497
void set_status( const Status s ) { _status = s; };
@@ -121,14 +124,17 @@ class BaseBracket {
121124
Status _status;
122125
int debugFlag;
123126
TiCC::LogStream& myLog;
127+
TiCC::UnicodeNormalizer& _normalizer;
124128
};
125129

126130
/// \brief a specialization of BaseBracket to store endnodes (morphemes and
127131
/// inflection information)
128132
class BracketLeaf: public BaseBracket {
129133
public:
130-
BracketLeaf( const RulePart&, int, TiCC::LogStream& );
131-
BracketLeaf( CLEX::Type, const icu::UnicodeString&, int, TiCC::LogStream& );
134+
BracketLeaf( const RulePart&, int,
135+
TiCC::LogStream&, TiCC::UnicodeNormalizer& );
136+
BracketLeaf( CLEX::Type, const icu::UnicodeString&, int,
137+
TiCC::LogStream&, TiCC::UnicodeNormalizer& );
132138
~BracketLeaf() override;
133139
icu::UnicodeString put( bool = false ) const override;
134140
icu::UnicodeString morpheme() const override {
@@ -170,7 +176,8 @@ class BracketLeaf: public BaseBracket {
170176
/// provides functions to test and resolve rules
171177
class BracketNest: public BaseBracket {
172178
public:
173-
BracketNest( CLEX::Type, Compound::Type, int, TiCC::LogStream& );
179+
BracketNest( CLEX::Type, Compound::Type, int,
180+
TiCC::LogStream&, TiCC::UnicodeNormalizer& );
174181
BaseBracket *append( BaseBracket * ) override ;
175182
~BracketNest() override;
176183
bool isNested() const override { return true; };

include/frog/mbma_mod.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ class Mbma {
123123
bool doDeepMorph;
124124
};
125125

126-
icu::UnicodeString flatten( const icu::UnicodeString& in );
126+
icu::UnicodeString flatten( const icu::UnicodeString&,
127+
TiCC::UnicodeNormalizer& );
127128

128129
#endif

include/frog/mbma_rule.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <string>
3838
#include "unicode/unistr.h"
3939
#include "ticcutils/LogStream.h"
40+
#include "ticcutils/Unicode.h"
4041
#include "frog/clex.h"
4142

4243
namespace Compound {
@@ -74,7 +75,8 @@ class Rule {
7475
const icu::UnicodeString&,
7576
TiCC::LogStream&,
7677
TiCC::LogStream&,
77-
int );
78+
int,
79+
TiCC::UnicodeNormalizer& );
7880
~Rule();
7981
std::vector<icu::UnicodeString> extract_morphemes() const;
8082
icu::UnicodeString pretty_string( bool ) const;
@@ -103,6 +105,7 @@ class Rule {
103105
double confidence;
104106
size_t ID;
105107
private:
108+
TiCC::UnicodeNormalizer& _normalizer;
106109
icu::UnicodeString sort_key;
107110
};
108111

src/Parser.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ vector<timbl_result> Parser::timbl( Timbl::TimblAPI* tim,
998998
const Timbl::ClassDistribution *db;
999999
const Timbl::TargetValue *tv = tim->Classify( inst, db );
10001000
result.push_back( timbl_result( TiCC::UnicodeToUTF8(tv->name(),_normalizer),
1001-
db->Confidence(tv), *db ) );
1001+
db->Confidence(tv), *db, _normalizer ) );
10021002
}
10031003
return result;
10041004
}

src/csidp.cxx

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,13 +164,14 @@ vector<const Constraint*> formulateWCSP( const vector<timbl_result>& d_res,
164164

165165
timbl_result::timbl_result( const string& cls,
166166
double conf,
167-
const Timbl::ClassDistribution& vd ):
167+
const Timbl::ClassDistribution& vd,
168+
TiCC::UnicodeNormalizer& norm ):
168169
_cls(cls),
169170
_confidence(conf)
170171
{
171172
for ( const auto& [dummy,val] : vd ){
172-
_dist.push_back( make_pair( TiCC::UnicodeToUTF8(val->Value()->name()),
173-
val->Weight()) );
173+
_dist.push_back( make_pair(TiCC::UnicodeToUTF8(val->Value()->name(), norm),
174+
val->Weight()));
174175
}
175176
}
176177

src/mbma_brackets.cxx

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,9 @@ ostream& operator<<( ostream& os, const Status& st ){
242242

243243
BracketLeaf::BracketLeaf( const RulePart& p,
244244
int debug_flag,
245-
TiCC::LogStream& l ):
246-
BaseBracket(p.ResultClass, p.RightHand, debug_flag, l ),
245+
TiCC::LogStream& l,
246+
TiCC::UnicodeNormalizer& norm ):
247+
BaseBracket(p.ResultClass, p.RightHand, debug_flag, l, norm ),
247248
_glue(false),
248249
_morph(p.morpheme )
249250
{
@@ -300,8 +301,9 @@ BracketLeaf::BracketLeaf( const RulePart& p,
300301
BracketLeaf::BracketLeaf( CLEX::Type t,
301302
const icu::UnicodeString& morpheme,
302303
int debug_flag,
303-
TiCC::LogStream& l ):
304-
BaseBracket( t, vector<CLEX::Type>(), debug_flag, l ),
304+
TiCC::LogStream& l,
305+
TiCC::UnicodeNormalizer& norm ):
306+
BaseBracket( t, vector<CLEX::Type>(), debug_flag, l, norm ),
305307
_glue(false),
306308
_orig( toUnicodeString( t ) ),
307309
_morph( morpheme )
@@ -320,8 +322,9 @@ BracketLeaf::BracketLeaf( CLEX::Type t,
320322
BracketNest::BracketNest( CLEX::Type t,
321323
Compound::Type c,
322324
int debug_flag,
323-
TiCC::LogStream& l ):
324-
BaseBracket( t, debug_flag, l ),
325+
TiCC::LogStream& l,
326+
TiCC::UnicodeNormalizer& norm ):
327+
BaseBracket( t, debug_flag, l, norm ),
325328
_compound( c )
326329
{
327330
/// create a BracketNest object from a CLEX::Type and a CompoundType
@@ -799,7 +802,6 @@ folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
799802
*/
800803
folia::Morpheme *result = 0;
801804
desc.remove();
802-
TiCC::UnicodeNormalizer UN;
803805
int pos = _orig.indexOf( "^" );
804806
bool glue = ( pos != -1 );
805807
string m_class = toString( _status );
@@ -862,22 +864,22 @@ folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
862864
args["set"] = Mbma::clex_tagset;
863865
if ( glue ){
864866
UnicodeString next_tag = _orig[pos+1];
865-
args["class"] = TiCC::UnicodeToUTF8(next_tag,UN);
867+
args["class"] = TiCC::UnicodeToUTF8(next_tag,_normalizer);
866868
desc = "[" + _morph + "]" + CLEX::get_tag_descr( CLEX::toCLEX(next_tag) );
867869
// spread the word upwards!
868870
}
869871
else {
870-
args["class"] = TiCC::UnicodeToUTF8(toString(tag()), UN );
872+
args["class"] = TiCC::UnicodeToUTF8(toString(tag()), _normalizer );
871873
desc = "[" + _morph + "]" + CLEX::get_tag_descr( tag() );
872874
// spread the word upwards!
873875
folia::KWargs fargs;
874876
fargs["subset"] = "structure";
875877
if ( tag() == CLEX::SPEC
876878
|| tag() == CLEX::LET ){
877-
fargs["class"] = TiCC::UnicodeToUTF8("[" + _morph + "]",UN);
879+
fargs["class"] = TiCC::UnicodeToUTF8("[" + _morph + "]",_normalizer);
878880
}
879881
else {
880-
fargs["class"] = TiCC::UnicodeToUTF8(desc,UN);
882+
fargs["class"] = TiCC::UnicodeToUTF8(desc,_normalizer);
881883
}
882884
#pragma omp critical (foliaupdate)
883885
{
@@ -892,7 +894,7 @@ folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
892894
else if ( _status == Status::PARTICLE ){
893895
folia::KWargs args;
894896
args["set"] = Mbma::clex_tagset;
895-
args["class"] = TiCC::UnicodeToUTF8(toString( tag() ), UN );
897+
args["class"] = TiCC::UnicodeToUTF8(toString( tag() ), _normalizer );
896898
#pragma omp critical (foliaupdate)
897899
{
898900
result->addPosAnnotation( args );
@@ -917,7 +919,7 @@ folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
917919
UnicodeString d = CLEX::get_inflect_descr(inf);
918920
if ( !d.isEmpty() ){
919921
// happens sometimes when there is faulty data
920-
args["class"] = TiCC::UnicodeToUTF8(d,UN);
922+
args["class"] = TiCC::UnicodeToUTF8(d,_normalizer);
921923
desc += "/" + d;
922924
#pragma omp critical (foliaupdate)
923925
{
@@ -946,7 +948,7 @@ folia::Morpheme *BracketLeaf::createMorpheme( folia::Document *doc,
946948
// now we add the description as a feature
947949
folia::KWargs args;
948950
args["subset"] = "structure";
949-
args["class"] = TiCC::UnicodeToUTF8(desc,UN);
951+
args["class"] = TiCC::UnicodeToUTF8(desc,_normalizer);
950952
#pragma omp critical (foliaupdate)
951953
{
952954
result->add_child<folia::Feature>( args );
@@ -979,7 +981,6 @@ folia::Morpheme *BracketNest::createMorpheme( folia::Document *doc,
979981
\param cnt a counter for the number of handled morphemes
980982
*/
981983
folia::Morpheme *result = 0;
982-
TiCC::UnicodeNormalizer UN;
983984
folia::KWargs args;
984985
args["class"] = "complex";
985986
args["set"] = Mbma::mbma_tagset;
@@ -1002,7 +1003,7 @@ folia::Morpheme *BracketNest::createMorpheme( folia::Document *doc,
10021003
if ( !it->original().isEmpty() ){
10031004
args.clear();
10041005
args["subset"] = "applied_rule";
1005-
args["class"] = TiCC::UnicodeToUTF8(it->original(),UN);
1006+
args["class"] = TiCC::UnicodeToUTF8(it->original(),_normalizer);
10061007
#pragma omp critical (foliaupdate)
10071008
{
10081009
result->add_child<folia::Feature>( args );
@@ -1024,14 +1025,14 @@ folia::Morpheme *BracketNest::createMorpheme( folia::Document *doc,
10241025
if ( desc.isEmpty() ){
10251026
desc = "XYZ";
10261027
}
1027-
args["class"] = TiCC::UnicodeToUTF8(desc,UN);
1028+
args["class"] = TiCC::UnicodeToUTF8(desc,_normalizer);
10281029
#pragma omp critical (foliaupdate)
10291030
{
10301031
result->add_child<folia::Feature>( args );
10311032
}
10321033
args.clear();
10331034
args["set"] = Mbma::clex_tagset;
1034-
args["class"] = TiCC::UnicodeToUTF8(toString( tag() ), UN);
1035+
args["class"] = TiCC::UnicodeToUTF8(toString( tag() ), _normalizer );
10351036
folia::PosAnnotation *pos = 0;
10361037
#pragma omp critical (foliaupdate)
10371038
{
@@ -1098,7 +1099,8 @@ list<BaseBracket*>::const_iterator BracketNest::resolveAffix( list<BaseBracket*>
10981099
BracketNest *tmp = new BracketNest( (*rpos)->tag(),
10991100
Compound::Type::NONE,
11001101
debugFlag,
1101-
myLog );
1102+
myLog,
1103+
_normalizer );
11021104
for ( size_t j = 0; j < len; ++j ){
11031105
tmp->append( *it );
11041106
if ( debugFlag > 5 ){
@@ -1137,7 +1139,8 @@ void BracketNest::resolveNouns( ){
11371139
if ( (*prev)->compound() == Compound::Type::NN ){
11381140
newt = Compound::Type::NNN;
11391141
}
1140-
BaseBracket *tmp = new BracketNest( CLEX::N, newt, debugFlag, myLog );
1142+
BaseBracket *tmp = new BracketNest( CLEX::N, newt, debugFlag,
1143+
myLog, _normalizer );
11411144
tmp->append( *prev );
11421145
tmp->append( *it );
11431146
if ( debugFlag > 5 ){
@@ -1213,7 +1216,8 @@ list<BaseBracket*>::iterator BracketNest::glue( list<BaseBracket*>& result,
12131216
}
12141217
list<BaseBracket*>::iterator it = bit--;
12151218
BracketNest *tmp
1216-
= new BracketNest( (*rpos)->tag(), Compound::Type::NONE, debugFlag, myLog );
1219+
= new BracketNest( (*rpos)->tag(), Compound::Type::NONE,
1220+
debugFlag, myLog, _normalizer );
12171221
for ( size_t j = 0; j < len-1; ++j ){
12181222
tmp->append( *it );
12191223
if ( debugFlag > 5 ){

src/mbma_mod.cxx

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ Rule* Mbma::matchRule( const std::vector<icu::UnicodeString>& ana,
423423
second person variants.
424424
\return a matched Rule or 0
425425
*/
426-
Rule *rule = new Rule( ana, word, *errLog, *dbgLog, debugFlag );
426+
Rule *rule = new Rule( ana, word, *errLog, *dbgLog, debugFlag, _normalizer );
427427
if ( rule->performEdits() ){
428428
rule->reduceZeroNodes();
429429
if ( debugFlag > 1 ){
@@ -883,7 +883,8 @@ void Mbma::store_brackets( frog_record& fd,
883883
BaseBracket *leaf = new BracketLeaf( clex_tag,
884884
wrd,
885885
debugFlag,
886-
*dbgLog );
886+
*dbgLog,
887+
_normalizer );
887888
if ( fd.morph_string.isEmpty() ){
888889
fd.morph_string = "[" + wrd + "]";
889890
if ( doDeepMorph ){
@@ -897,7 +898,8 @@ void Mbma::store_brackets( frog_record& fd,
897898
BaseBracket *leaf = new BracketLeaf( CLEX::toCLEX(head),
898899
wrd,
899900
debugFlag,
900-
*dbgLog );
901+
*dbgLog,
902+
_normalizer );
901903
leaf->set_status( STEM );
902904
if ( fd.morph_string.isEmpty() ){
903905
fd.morph_string = "[" + wrd + "]";
@@ -909,7 +911,8 @@ void Mbma::store_brackets( frog_record& fd,
909911
BaseBracket *leaf = new BracketLeaf( CLEX::toCLEX(head),
910912
wrd,
911913
debugFlag,
912-
*dbgLog );
914+
*dbgLog,
915+
_normalizer );
913916
leaf->set_status( STEM );
914917
if ( fd.morph_string.isEmpty() ){
915918
fd.morph_string = "[" + wrd + "]";
@@ -936,14 +939,15 @@ void Mbma::store_brackets( frog_record& fd,
936939
return;
937940
}
938941

939-
UnicodeString flatten( const UnicodeString& in ) {
942+
UnicodeString flatten( const UnicodeString& in,
943+
TiCC::UnicodeNormalizer& norm ) {
940944
/// helper function to 'flatten out' bracketed morpheme strings
941945
/*!
942946
\param in a bracketed string of morphemes
943947
\return a string with multiple '[' and ']' reduced to single occurrences
944948
*/
945-
TiCC::UnicodeNormalizer UN;
946-
string s = TiCC::UnicodeToUTF8( in, UN );
949+
950+
string s = TiCC::UnicodeToUTF8( in, norm );
947951
string::size_type bpos = s.find_first_not_of( " [" );
948952
// deb << " FLATTEN: '" << s << "'" << endl;
949953
string result;
@@ -968,7 +972,7 @@ UnicodeString flatten( const UnicodeString& in ) {
968972
result = s;
969973
}
970974
// deb << "FLATTENED: '" << result << "'" << endl;
971-
return TiCC::UnicodeFromUTF8(result,UN);
975+
return TiCC::UnicodeFromUTF8(result,norm);
972976
}
973977

974978
void Mbma::storeResult( frog_record& fd,
@@ -991,7 +995,7 @@ void Mbma::storeResult( frog_record& fd,
991995
fd.morph_string = pv[0].first;
992996
}
993997
else {
994-
fd.morph_string = flatten( pv[0].first );
998+
fd.morph_string = flatten( pv[0].first, _normalizer );
995999
}
9961000
if ( pv[0].second == "none" ){
9971001
fd.compound_string = "0";

0 commit comments

Comments
 (0)