-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIndexedDocument.cs
More file actions
429 lines (361 loc) · 14.2 KB
/
IndexedDocument.cs
File metadata and controls
429 lines (361 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
using MongoDB.Bson;
using MongoDB.Bson.Serialization.Attributes;
using MongoDB.Driver;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace MongoLucene
{
// Document model for storing indexed content
/// <summary>
/// Persistence model for one indexed document: its stored field values, the
/// per-field token lists used for term matching, the time it was indexed and
/// a boost factor.
/// </summary>
public class IndexedDocument
{
/// <summary>Record identifier, mapped as the BSON id via <c>[BsonId]</c>.</summary>
[BsonId]
public ObjectId Id { get; set; }
/// <summary>Caller-supplied identifier; the writer looks documents up by this value.</summary>
public string DocumentId { get; set; }
/// <summary>Stored field values keyed by field name. Starts out empty.</summary>
public Dictionary<string, object> Fields { get; set; } = new Dictionary<string, object>();
/// <summary>Token lists keyed by field name. Starts out empty.</summary>
public Dictionary<string, List<string>> Tokens { get; set; } = new Dictionary<string, List<string>>();
/// <summary>Defaults to the UTC time at which this instance was constructed.</summary>
public DateTime IndexedAt { get; set; } = DateTime.UtcNow;
/// <summary>Document-level boost factor; defaults to 1.0.</summary>
public float Boost { get; set; } = 1.0f;
}
// Field definition for document structure
/// <summary>
/// One named value handed to the index writer, together with the flags that
/// say whether it is stored, indexed and analyzed.
/// </summary>
public class Field
{
    /// <summary>Field name; used as the key in the stored-field and token maps.</summary>
    public string Name { get; set; }

    /// <summary>Raw field value.</summary>
    public object Value { get; set; }

    /// <summary>Declared kind of the value.</summary>
    public FieldType Type { get; set; }

    /// <summary>Whether the original value should be kept with the document.</summary>
    public bool IsStored { get; set; }

    /// <summary>Whether tokens should be produced for this field.</summary>
    public bool IsIndexed { get; set; }

    /// <summary>Whether the value should be run through the analyzer.</summary>
    public bool IsAnalyzed { get; set; }

    /// <summary>Field-level boost factor; defaults to 1.0.</summary>
    public float Boost { get; set; } = 1.0f;

    /// <summary>
    /// Creates a field. By default it is a stored, indexed, analyzed text field.
    /// </summary>
    /// <param name="name">Field name.</param>
    /// <param name="value">Field value.</param>
    /// <param name="type">Kind of value.</param>
    /// <param name="stored">Initial value of <see cref="IsStored"/>.</param>
    /// <param name="indexed">Initial value of <see cref="IsIndexed"/>.</param>
    /// <param name="analyzed">Initial value of <see cref="IsAnalyzed"/>.</param>
    public Field(string name, object value, FieldType type = FieldType.Text,
        bool stored = true, bool indexed = true, bool analyzed = true)
        => (Name, Value, Type, IsStored, IsIndexed, IsAnalyzed)
            = (name, value, type, stored, indexed, analyzed);
}
/// <summary>
/// Declared kind of a <see cref="Field"/> value. The index writer in this file
/// only runs the analyzer on <see cref="Text"/> fields; every other kind is
/// indexed as a single token.
/// </summary>
public enum FieldType
{
Text,
Keyword,
Integer,
Float,
Date,
Boolean
}
// Text analyzer interface and implementation
/// <summary>Turns a piece of text into a list of tokens.</summary>
public interface IAnalyzer
{
/// <summary>Produces the tokens for <paramref name="text"/>.</summary>
/// <param name="text">Text to tokenize.</param>
/// <returns>The resulting tokens, in the order the implementation emits them.</returns>
List<string> Analyze(string text);
}
/// <summary>
/// Default analyzer: lowercases the text, extracts word tokens, and drops
/// English stop words and tokens of two characters or fewer.
/// </summary>
public class StandardAnalyzer : IAnalyzer
{
    // Compiled once and shared; the pattern never changes between calls.
    private static readonly Regex WordPattern =
        new Regex(@"\b\w+\b", RegexOptions.Compiled);

    // Read-only after construction, so a single shared instance is sufficient.
    private static readonly HashSet<string> StopWords = new HashSet<string>
    {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
        "such", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with"
    };

    /// <summary>Tokenizes <paramref name="text"/>.</summary>
    /// <param name="text">Text to analyze; may be null or blank.</param>
    /// <returns>
    /// The lowercase tokens in order of appearance, or an empty list when
    /// <paramref name="text"/> is null, empty or whitespace.
    /// </returns>
    public List<string> Analyze(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return new List<string>();
        }

        // Invariant lowercasing keeps tokens identical regardless of the
        // process culture (culture-sensitive ToLower maps 'I' to a dotless
        // 'ı' under Turkish cultures, which would break term matching).
        var normalized = text.ToLowerInvariant();

        return WordPattern.Matches(normalized)
            .Cast<Match>()
            .Select(m => m.Value)
            .Where(word => word.Length > 2 && !StopWords.Contains(word))
            .ToList();
    }
}
// Query classes
/// <summary>Base class for all queries that can be translated to a MongoDB filter.</summary>
public abstract class Query
{
/// <summary>Query-level boost factor; defaults to 1.0.</summary>
public float Boost { get; set; } = 1.0f;
/// <summary>Builds the MongoDB filter over <see cref="IndexedDocument"/> that represents this query.</summary>
/// <returns>The filter definition.</returns>
public abstract FilterDefinition<IndexedDocument> ToMongoFilter();
}
/// <summary>Matches documents whose token list for a field contains an exact term.</summary>
public class TermQuery : Query
{
    /// <summary>Name of the field whose tokens are searched.</summary>
    public string Field { get; set; }

    /// <summary>Token that must be present.</summary>
    public string Term { get; set; }

    /// <summary>Creates a query for <paramref name="term"/> in <paramref name="field"/>.</summary>
    /// <param name="field">Field name.</param>
    /// <param name="term">Term to look for.</param>
    public TermQuery(string field, string term) => (Field, Term) = (field, term);

    /// <summary>
    /// Builds an "any element equals" filter on the <c>Tokens.&lt;field&gt;</c> array.
    /// </summary>
    /// <returns>The filter definition.</returns>
    public override FilterDefinition<IndexedDocument> ToMongoFilter()
    {
        var tokenPath = "Tokens." + Field;
        return Builders<IndexedDocument>.Filter.AnyEq(tokenPath, Term);
    }
}
/// <summary>
/// Combines sub-queries. All Must clauses are AND-ed, the Should clauses are
/// OR-ed into one group, the MustNot clauses are OR-ed and negated, and the
/// resulting groups are AND-ed together.
/// </summary>
public class BooleanQuery : Query
{
    /// <summary>Clauses in the order they were added.</summary>
    public List<BooleanClause> Clauses { get; set; } = new List<BooleanClause>();

    /// <summary>Appends a clause.</summary>
    /// <param name="query">Sub-query.</param>
    /// <param name="occur">How the sub-query participates in the match.</param>
    public void Add(Query query, BooleanClause.Occur occur)
        => Clauses.Add(new BooleanClause(query, occur));

    /// <summary>Translates all clauses and combines them into a single filter.</summary>
    /// <returns>
    /// The combined filter; the empty (match-all) filter when there are no clauses.
    /// </returns>
    public override FilterDefinition<IndexedDocument> ToMongoFilter()
    {
        var builder = Builders<IndexedDocument>.Filter;

        var required = new List<FilterDefinition<IndexedDocument>>();
        var optional = new List<FilterDefinition<IndexedDocument>>();
        var prohibited = new List<FilterDefinition<IndexedDocument>>();

        // Dispatch table: occurrence -> bucket that collects its filters.
        var buckets = new Dictionary<BooleanClause.Occur, List<FilterDefinition<IndexedDocument>>>
        {
            [BooleanClause.Occur.Must] = required,
            [BooleanClause.Occur.Should] = optional,
            [BooleanClause.Occur.MustNot] = prohibited
        };

        foreach (var clause in Clauses)
        {
            // Every clause is translated, even if its occurrence is not a known value.
            var translated = clause.Query.ToMongoFilter();
            if (buckets.TryGetValue(clause.Occurrence, out var bucket))
            {
                bucket.Add(translated);
            }
        }

        var result = builder.Empty;

        if (required.Count > 0)
        {
            result &= builder.And(required);
        }

        if (optional.Count > 0)
        {
            result &= builder.Or(optional);
        }

        if (prohibited.Count > 0)
        {
            result &= builder.Not(builder.Or(prohibited));
        }

        return result;
    }
}
/// <summary>A sub-query paired with the way it takes part in a <see cref="BooleanQuery"/>.</summary>
public class BooleanClause
{
    /// <summary>How a clause participates in the enclosing boolean query.</summary>
    public enum Occur
    {
        Must,
        Should,
        MustNot
    }

    /// <summary>The wrapped sub-query.</summary>
    public Query Query { get; set; }

    /// <summary>Participation mode of the sub-query.</summary>
    public Occur Occurrence { get; set; }

    /// <summary>Creates a clause.</summary>
    /// <param name="query">Sub-query.</param>
    /// <param name="occur">Participation mode.</param>
    public BooleanClause(Query query, Occur occur)
        => (this.Query, this.Occurrence) = (query, occur);
}
/// <summary>
/// Matches documents whose stored field value contains the terms joined by
/// single spaces, compared case-insensitively.
/// </summary>
public class PhraseQuery : Query
{
    /// <summary>Name of the stored field that is searched.</summary>
    public string Field { get; set; }

    /// <summary>Phrase terms, in order.</summary>
    public List<string> Terms { get; set; } = new List<string>();

    /// <summary>Slop setting; defaults to 0. The filter built here does not read it.</summary>
    public int Slop { get; set; } = 0;

    /// <summary>Creates a phrase query.</summary>
    /// <param name="field">Field name.</param>
    /// <param name="terms">Terms that make up the phrase.</param>
    public PhraseQuery(string field, params string[] terms)
    {
        Field = field;
        Terms.AddRange(terms);
    }

    /// <summary>
    /// Builds a case-insensitive regular-expression filter on
    /// <c>Fields.&lt;field&gt;</c> for the escaped phrase.
    /// </summary>
    /// <returns>The filter definition.</returns>
    public override FilterDefinition<IndexedDocument> ToMongoFilter()
    {
        // The phrase is matched literally, so regex metacharacters are escaped.
        var escapedPhrase = Regex.Escape(string.Join(" ", Terms));
        var expression = new BsonRegularExpression(escapedPhrase, "i");
        var storedPath = "Fields." + Field;
        return Builders<IndexedDocument>.Filter.Regex(storedPath, expression);
    }
}
// Search results
/// <summary>One hit returned by a search: the matched document, its score and its stored fields.</summary>
public class SearchResult
{
/// <summary>The matched document.</summary>
public IndexedDocument Document { get; set; }
/// <summary>Score assigned to the hit by the searcher.</summary>
public float Score { get; set; }
/// <summary>Field values for the hit; the searcher in this file sets it to the document's <c>Fields</c> map.</summary>
public Dictionary<string, object> Fields { get; set; }
}
/// <summary>Outcome of a search: the returned page of hits plus summary figures.</summary>
public class SearchResults
{
/// <summary>Hits in the returned page. Starts out empty.</summary>
public List<SearchResult> Results { get; set; } = new List<SearchResult>();
/// <summary>Number of documents matching the query filter, independent of paging.</summary>
public long TotalHits { get; set; }
/// <summary>Elapsed time measured by the searcher for the whole search call.</summary>
public TimeSpan QueryTime { get; set; }
}
// Main index writer class
/// <summary>
/// Writes documents into the MongoDB collection that backs an index:
/// analyzes fields into tokens and upserts one record per document id.
/// </summary>
public class IndexWriter
{
    private readonly IMongoCollection<IndexedDocument> _collection;
    private readonly IAnalyzer _analyzer;

    /// <summary>
    /// Opens the collection named <paramref name="indexName"/> and creates its indexes.
    /// </summary>
    /// <param name="database">Database that holds the index collection.</param>
    /// <param name="indexName">Collection name.</param>
    /// <param name="analyzer">Analyzer for text fields; a <see cref="StandardAnalyzer"/> is used when null.</param>
    public IndexWriter(IMongoDatabase database, string indexName, IAnalyzer analyzer = null)
    {
        _collection = database.GetCollection<IndexedDocument>(indexName);
        _analyzer = analyzer ?? new StandardAnalyzer();

        // Create indexes for better performance
        CreateIndexes();
    }

    // Creates a text index over "Fields"/"Tokens" and an ascending index on DocumentId.
    // NOTE(review): the query classes in this file filter with AnyEq and Regex rather
    // than a text search; confirm the text index is actually needed.
    private void CreateIndexes()
    {
        var indexKeysDefinition = Builders<IndexedDocument>.IndexKeys
            .Text("Fields")
            .Text("Tokens");
        _collection.Indexes.CreateOne(new CreateIndexModel<IndexedDocument>(indexKeysDefinition));

        // Index on DocumentId for faster lookups
        _collection.Indexes.CreateOne(new CreateIndexModel<IndexedDocument>(
            Builders<IndexedDocument>.IndexKeys.Ascending(x => x.DocumentId)));
    }

    /// <summary>
    /// Adds the document, or overwrites the existing record with the same
    /// <paramref name="documentId"/>.
    /// </summary>
    /// <param name="documentId">Identifier of the document.</param>
    /// <param name="fields">Fields to store and/or index.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="documentId"/> or <paramref name="fields"/> is null.
    /// </exception>
    public async Task AddDocumentAsync(string documentId, params Field[] fields)
    {
        if (documentId == null)
        {
            throw new ArgumentNullException(nameof(documentId));
        }

        if (fields == null)
        {
            throw new ArgumentNullException(nameof(fields));
        }

        var doc = new IndexedDocument
        {
            DocumentId = documentId
        };

        foreach (var field in fields)
        {
            // Store the original value if needed
            if (field.IsStored)
            {
                doc.Fields[field.Name] = field.Value;
            }

            // Analyze and tokenize text fields for indexing
            if (field.IsIndexed && field.IsAnalyzed && field.Type == FieldType.Text)
            {
                doc.Tokens[field.Name] = _analyzer.Analyze(field.Value?.ToString() ?? "");
            }
            else if (field.IsIndexed)
            {
                // For non-analyzed fields, store as single token
                doc.Tokens[field.Name] = new List<string> { field.Value?.ToString() ?? "" };
            }
        }

        // Upsert with $set instead of replacing the whole record. A replacement
        // document would carry the never-assigned Id (ObjectId.Empty) as _id, so
        // every new document would try to insert the same all-zero _id and the
        // second distinct document would fail with a duplicate-key error. With an
        // update, the server generates _id on insert, keeps it on overwrite, and
        // fills DocumentId from the equality filter.
        var update = Builders<IndexedDocument>.Update
            .Set(x => x.Fields, doc.Fields)
            .Set(x => x.Tokens, doc.Tokens)
            .Set(x => x.IndexedAt, doc.IndexedAt)
            .Set(x => x.Boost, doc.Boost);

        await _collection.UpdateOneAsync(
            Builders<IndexedDocument>.Filter.Eq(x => x.DocumentId, documentId),
            update,
            new UpdateOptions { IsUpsert = true });
    }

    /// <summary>Deletes one record whose DocumentId equals <paramref name="documentId"/>.</summary>
    /// <param name="documentId">Identifier of the document to remove.</param>
    public async Task DeleteDocumentAsync(string documentId)
    {
        await _collection.DeleteOneAsync(
            Builders<IndexedDocument>.Filter.Eq(x => x.DocumentId, documentId));
    }

    /// <summary>
    /// No-op kept for API compatibility; writes are issued directly by the
    /// add/delete methods.
    /// </summary>
    public async Task CommitAsync()
    {
        await Task.CompletedTask;
    }
}
// Index searcher class
/// <summary>Runs queries against the MongoDB collection that backs an index.</summary>
public class IndexSearcher
{
    private readonly IMongoCollection<IndexedDocument> _collection;

    /// <summary>Opens the collection named <paramref name="indexName"/>.</summary>
    /// <param name="database">Database that holds the index collection.</param>
    /// <param name="indexName">Collection name.</param>
    public IndexSearcher(IMongoDatabase database, string indexName)
    {
        _collection = database.GetCollection<IndexedDocument>(indexName);
    }

    /// <summary>
    /// Executes <paramref name="query"/> and returns one page of hits ordered
    /// by descending score.
    /// </summary>
    /// <param name="query">Query to execute.</param>
    /// <param name="maxResults">Maximum number of hits in the page.</param>
    /// <param name="skip">Number of hits to skip before the page starts.</param>
    /// <returns>The page of hits, the total match count and the elapsed time.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public async Task<SearchResults> SearchAsync(Query query, int maxResults = 10, int skip = 0)
    {
        if (query == null)
        {
            throw new ArgumentNullException(nameof(query));
        }

        // Stopwatch is monotonic; subtracting wall-clock timestamps can go
        // wrong when the system clock is adjusted mid-query.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();

        var filter = query.ToMongoFilter();
        var totalCount = await _collection.CountDocumentsAsync(filter);

        // The score is doc.Boost * query.Boost, so ordering by Boost on the
        // server (direction chosen by the sign of the query boost) yields score
        // order. Sorting must happen before Skip/Limit; otherwise each page is an
        // arbitrary slice and ranking is inconsistent across pages.
        var sortBuilder = Builders<IndexedDocument>.Sort;
        var sort = query.Boost < 0
            ? sortBuilder.Ascending(x => x.Boost)
            : sortBuilder.Descending(x => x.Boost);

        var documents = await _collection
            .Find(filter)
            .Sort(sort)
            .Skip(skip)
            .Limit(maxResults)
            .ToListAsync();

        var results = documents
            .Select(doc => new SearchResult
            {
                Document = doc,
                Score = CalculateScore(doc, query),
                Fields = doc.Fields
            })
            .OrderByDescending(r => r.Score)
            .ToList();

        stopwatch.Stop();

        return new SearchResults
        {
            Results = results,
            TotalHits = totalCount,
            QueryTime = stopwatch.Elapsed
        };
    }

    // Simple scoring: product of the document boost and the query boost.
    private float CalculateScore(IndexedDocument doc, Query query)
    {
        return doc.Boost * query.Boost;
    }
}
// Query parser for simple query strings
/// <summary>
/// Parses simple query strings into <see cref="Query"/> objects: a fully
/// quoted string becomes a phrase query, a string containing upper-case
/// " AND ", " OR " or " NOT " becomes a boolean query, anything else a
/// single term query on the default field.
/// </summary>
public class QueryParser
{
    // Splits on operators surrounded by whitespace; the capture group keeps
    // the operator itself in the split output (odd positions).
    private static readonly Regex OperatorSeparator =
        new Regex(@"\s+(AND|OR|NOT)\s+", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly string _defaultField;
    private readonly IAnalyzer _analyzer;

    /// <summary>Creates a parser.</summary>
    /// <param name="defaultField">Field that every produced query targets.</param>
    /// <param name="analyzer">
    /// Analyzer applied to phrase contents; a <see cref="StandardAnalyzer"/> is
    /// used when null so phrase parsing cannot fail on a missing analyzer.
    /// </param>
    public QueryParser(string defaultField, IAnalyzer analyzer)
    {
        _defaultField = defaultField;
        _analyzer = analyzer ?? new StandardAnalyzer();
    }

    /// <summary>Parses <paramref name="queryString"/>.</summary>
    /// <param name="queryString">Raw query text; may be null or blank.</param>
    /// <returns>
    /// A term query for the empty term when the input is null or blank;
    /// otherwise a phrase, boolean or term query as described on the class.
    /// </returns>
    public Query Parse(string queryString)
    {
        if (string.IsNullOrWhiteSpace(queryString))
        {
            return new TermQuery(_defaultField, "");
        }

        queryString = queryString.Trim();

        // Phrase query: needs an opening AND a closing quote. Requiring two
        // characters stops a lone '"' from satisfying both checks, which would
        // otherwise ask Substring for a negative length and throw.
        if (queryString.Length >= 2
            && queryString[0] == '"'
            && queryString[queryString.Length - 1] == '"')
        {
            var phrase = queryString.Substring(1, queryString.Length - 2);
            var terms = _analyzer.Analyze(phrase);
            return new PhraseQuery(_defaultField, terms.ToArray());
        }

        // Boolean query: triggered only by upper-case operators.
        if (queryString.Contains(" AND ") || queryString.Contains(" OR ") || queryString.Contains(" NOT "))
        {
            return ParseBoolean(queryString);
        }

        // Simple term query; invariant lowercasing matches the analyzer's tokens
        // independently of the process culture.
        return new TermQuery(_defaultField, queryString.ToLowerInvariant());
    }

    // Builds a BooleanQuery from "term OP term OP term ...". Each term takes its
    // occurrence from the operator that precedes it; the first term has no
    // preceding operator and defaults to Should.
    private Query ParseBoolean(string queryString)
    {
        var boolQuery = new BooleanQuery();
        var parts = OperatorSeparator.Split(queryString);

        // Even positions hold terms, odd positions hold the captured operators.
        for (int i = 0; i < parts.Length; i += 2)
        {
            var occur = BooleanClause.Occur.Should;
            if (i > 0)
            {
                occur = parts[i - 1].ToUpperInvariant() switch
                {
                    "AND" => BooleanClause.Occur.Must,
                    "OR" => BooleanClause.Occur.Should,
                    "NOT" => BooleanClause.Occur.MustNot,
                    _ => BooleanClause.Occur.Should
                };
            }

            // Lowercase like the simple-term path does; indexed tokens are
            // lowercase, so an un-normalized "Foo" could never match.
            var term = parts[i].Trim().ToLowerInvariant();
            boolQuery.Add(new TermQuery(_defaultField, term), occur);
        }

        return boolQuery;
    }
}
// Usage example and factory class
/// <summary>
/// Entry point that binds a MongoDB database and an index name, and hands out
/// writers, searchers and query parsers for that index.
/// </summary>
public class MongoLuceneIndex
{
    private readonly IMongoDatabase _database;
    private readonly string _indexName;

    /// <summary>Connects to the database that holds the index.</summary>
    /// <param name="connectionString">MongoDB connection string.</param>
    /// <param name="databaseName">Name of the database.</param>
    /// <param name="indexName">Name of the index collection.</param>
    public MongoLuceneIndex(string connectionString, string databaseName, string indexName)
    {
        _indexName = indexName;
        _database = new MongoClient(connectionString).GetDatabase(databaseName);
    }

    /// <summary>Creates a writer for the index.</summary>
    /// <param name="analyzer">Analyzer passed through to the writer; may be null.</param>
    /// <returns>A new <see cref="IndexWriter"/>.</returns>
    public IndexWriter GetWriter(IAnalyzer analyzer = null)
        => new IndexWriter(_database, _indexName, analyzer);

    /// <summary>Creates a searcher for the index.</summary>
    /// <returns>A new <see cref="IndexSearcher"/>.</returns>
    public IndexSearcher GetSearcher()
        => new IndexSearcher(_database, _indexName);

    /// <summary>Creates a query parser.</summary>
    /// <param name="defaultField">Field targeted by parsed queries.</param>
    /// <param name="analyzer">Analyzer to use; a <see cref="StandardAnalyzer"/> is created when null.</param>
    /// <returns>A new <see cref="QueryParser"/>.</returns>
    public QueryParser GetQueryParser(string defaultField, IAnalyzer analyzer = null)
    {
        var effectiveAnalyzer = analyzer ?? new StandardAnalyzer();
        return new QueryParser(defaultField, effectiveAnalyzer);
    }
}
}