本文简单介绍用Lucene.Net实现GroupBy效果的方法,与《Lucene.Net 按类别统计搜索结果数》一文类似。注意,这种做法对效率影响很大,特别是在命中结果较多的情况下。这段代码基于2.3.1版本修改而来,其它版本可能与此有差别。
改造方法仍然是修改IndexSearcher,这里不再修改类库,而是通过自己的代码来实现。
/// <summary>
/// An IndexSearcher with an added "group by" capability: the search is routed
/// through a TopDocCollectorExtension, which reads <see cref="FieldName"/>.
/// </summary>
public class IndexSearcherExtension : IndexSearcher
{
    /// <summary>
    /// Name of the field to group by. Remains null until GroupBy is called.
    /// </summary>
    private string fieldName;

    /// <summary>
    /// Only this one base constructor is surfaced here; the others are not listed.
    /// </summary>
    /// <param name="path">Passed straight through to the IndexSearcher constructor.</param>
    public IndexSearcherExtension(string path)
        : base(path)
    {
    }

    /// <summary>
    /// The group-by field name, exposed for TopDocCollectorExtension.Collect.
    /// </summary>
    public string FieldName
    {
        get
        {
            return this.fieldName;
        }
    }

    /// <summary>
    /// Selects the field to group by. Call this before Search.
    /// </summary>
    /// <param name="fieldName">Name of the field to group by.</param>
    public void GroupBy(string fieldName)
    {
        this.fieldName = fieldName;
    }

    /// <summary>
    /// Overrides Search so that hits are gathered by a TopDocCollectorExtension
    /// bound to this searcher.
    /// </summary>
    /// <param name="weight">Forwarded to the collector-based Search overload.</param>
    /// <param name="filter">Forwarded to the collector-based Search overload.</param>
    /// <param name="nDocs">Number of hits to collect; must be greater than zero.</param>
    /// <returns>The TopDocs produced by the collector.</returns>
    /// <exception cref="System.ArgumentException">Thrown when nDocs is not positive.</exception>
    public override TopDocs Search(Weight weight, Filter filter, int nDocs)
    {
        // Reject non-positive sizes up front; the original comment notes that the
        // hit queue's top element might otherwise be null.
        if (nDocs < 1)
        {
            throw new System.ArgumentException("nDocs must be > 0");
        }

        TopDocCollectorExtension groupingCollector = new TopDocCollectorExtension(nDocs, this);
        Search(weight, filter, groupingCollector);
        return groupingCollector.TopDocs();
    }
}
/// <summary>
/// Per the original author, implemented exactly like the library's HitQueue class;
/// it is re-declared only because the library's constructor cannot be used from here.
/// </summary>
public class HitQueueExtension : PriorityQueue
{
    /// <summary>Initializes the underlying priority queue with the given size.</summary>
    /// <param name="size">Value handed to PriorityQueue.Initialize.</param>
    internal HitQueueExtension(int size)
    {
        Initialize(size);
    }

    /// <summary>
    /// Orders two ScoreDoc entries: the lower score is "less"; on an exact score
    /// tie, the entry with the larger doc number is "less".
    /// </summary>
    /// <param name="a">First entry; must be a ScoreDoc.</param>
    /// <param name="b">Second entry; must be a ScoreDoc.</param>
    /// <returns>True when <paramref name="a"/> ranks below <paramref name="b"/>.</returns>
    public override bool LessThan(System.Object a, System.Object b)
    {
        ScoreDoc left = (ScoreDoc) a;
        ScoreDoc right = (ScoreDoc) b;

        // Differing scores decide directly; equal scores fall back to the doc number.
        return left.score != right.score
            ? left.score < right.score
            : left.doc > right.doc;
    }
}
/// <summary>
/// A stand-in for TopDocCollector (which, per the original author, cannot be
/// inherited from directly) that can additionally drop hits whose group-by
/// field value has already been collected.
/// </summary>
public class TopDocCollectorExtension : HitCollector
{
    private ScoreDoc reusableSD;
    internal int totalHits;
    internal PriorityQueue hq;

    /// <summary>
    /// The injected searcher that supplies the group-by field name and the index
    /// reader. Null when the collector was built without a searcher, in which
    /// case no de-duplication is performed.
    /// </summary>
    private IndexSearcherExtension searcher;

    /// <summary>
    /// Field values already collected, used for de-duplication. The full string
    /// is stored rather than hash codes, so hash collisions cannot merge two
    /// different values and a repeated key can no longer throw.
    /// </summary>
    private HashSet<string> seenValues = new HashSet<string>();

    /// <summary>Construct to collect a given number of hits.</summary>
    /// <param name="numHits">the maximum number of hits to collect
    /// </param>
    public TopDocCollectorExtension(int numHits)
        : this(numHits, new HitQueueExtension(numHits))
    {
    }

    /// <summary>
    /// Construct a collector bound to the searcher whose FieldName drives grouping.
    /// </summary>
    /// <param name="numHits">the maximum number of hits to collect</param>
    /// <param name="searcher">searcher providing FieldName and the index reader</param>
    public TopDocCollectorExtension(int numHits, IndexSearcherExtension searcher)
        : this(numHits)
    {
        this.searcher = searcher;
    }

    /// <summary>Construct a collector on top of a caller-supplied queue.</summary>
    /// <param name="numHits">not used by this constructor; the queue fixes the capacity</param>
    /// <param name="hq">the priority queue that retains the best hits</param>
    internal TopDocCollectorExtension(int numHits, PriorityQueue hq)
    {
        this.hq = hq;
    }

    /// <summary>
    /// Records one hit. Hits with a non-positive score are ignored, and so are
    /// hits whose group-by field value has been seen before. Accepted hits
    /// increment the total and are offered to the queue.
    /// </summary>
    /// <param name="doc">document number of the hit</param>
    /// <param name="score">score of the hit</param>
    public override void Collect(int doc, float score)
    {
        // Written as !(score > 0) so that NaN is rejected exactly like before.
        if (!(score > 0.0f))
        {
            return;
        }

        if (IsDuplicate(doc))
        {
            return;
        }

        totalHits++;
        if (reusableSD == null)
        {
            reusableSD = new ScoreDoc(doc, score);
        }
        else if (score >= reusableSD.score)
        {
            // reusableSD holds the last "rejected" entry, so, if
            // this new score is not better than that, there's no
            // need to try inserting it
            reusableSD.doc = doc;
            reusableSD.score = score;
        }
        else
        {
            return;
        }
        reusableSD = (ScoreDoc)hq.InsertWithOverflow(reusableSD);
    }

    /// <summary>
    /// Decides whether the document's group-by value was already collected,
    /// remembering the value when it is new.
    /// </summary>
    /// <param name="doc">document number to inspect</param>
    /// <returns>true when the hit must be skipped as a duplicate</returns>
    private bool IsDuplicate(int doc)
    {
        // No searcher or no group-by field configured: grouping is disabled.
        if (searcher == null || string.IsNullOrEmpty(searcher.FieldName))
        {
            return false;
        }

        IndexReader reader = searcher.GetIndexReader();
        Document document = reader.Document(doc);
        string value = document.Get(searcher.FieldName);
        if (value == null)
        {
            // Get returned no value for this field, so there is nothing to group
            // on; keep the hit rather than crash on the null.
            return false;
        }

        // Add returns false when the trimmed value is already present.
        return !seenValues.Add(value.Trim());
    }

    /// <summary>The total number of documents that matched this query. </summary>
    public virtual int GetTotalHits()
    {
        return totalHits;
    }

    /// <summary>The top-scoring hits. </summary>
    public virtual TopDocs TopDocs()
    {
        ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];
        for (int i = hq.Size() - 1; i >= 0; i--)
        {
            // put docs in array
            scoreDocs[i] = (ScoreDoc)hq.Pop();
        }

        // Test the array itself so an empty queue can never be indexed, even if
        // hits were counted but none were retained.
        float maxScore = (scoreDocs.Length == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score;
        return new TopDocs(totalHits, scoreDocs, maxScore);
    }
}
{
    // Build a fresh index at e:\index containing the same document three times.
    IndexWriter writer = new IndexWriter("e:\\index", new StandardAnalyzer(), true);
    try
    {
        Document doc = new Document();
        doc.Add(new Field("field", "query value!", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.AddDocument(doc);
        writer.AddDocument(doc);
    }
    finally
    {
        // Close the writer even if adding a document fails.
        writer.Close();
    }

    // Search with grouping on "field".
    IndexSearcherExtension searcher = new IndexSearcherExtension("e:\\index");
    try
    {
        searcher.GroupBy("field");
        Query q = new QueryParser("field", new StandardAnalyzer())
            .Parse("query");
        Hits docs = searcher.Search(q);
        for (int i = 0; i < docs.Length(); i++)
        {
            Console.WriteLine(docs.Doc(i).Get("field"));
        }
    }
    finally
    {
        // Close the searcher even if parsing or searching throws.
        searcher.Close();
    }
    Console.ReadKey();
}
添加了三个相同的文档,结果只查询到一个结果,从而达到了目的。这段修改比较简单,应该还可以设计出更加高效的算法。
Mikel




















Replacing the default ViewEngine with a custom made version is actually quite easy! The most difficult part in creating your own ViewEngine implementation will probably be the parsing of your view. Fortunately, there are some examples around which may be a good source of inspiration (see