Lucene5学习之自定义同义词分词器简单示例
学习 简单 自定义 示例 分词器 同义词
2023-09-14 08:59:37 时间
同义词功能在全文搜索时的意义,大家应该都懂的。今天中文我就试着写了一个同义词分词的示例demo,其实主要代码还是参考Lucene in Action 这本英文版书籍的随书代码,只不过Lucene in Action书里的示例代码目前最新版只支持到Lucene4.x,对于Lucene5.x,代码需要稍作修改,下面是基于Lucene5.x的自定义同义词分词器demo:
import java.io.IOException;

/**
 * Strategy for looking up the synonyms of a single term.
 */
public interface SynonymEngine {

    /**
     * Looks up the synonyms of the given term.
     *
     * @param s the term to look up
     * @return the synonyms of {@code s}; the implementation and the consumer
     *         shown alongside this interface use {@code null} to mean
     *         "no synonyms known"
     * @throws IOException if the underlying lookup fails
     */
    String[] getSynonyms(String s) throws IOException;
}
public class BaseSynonymEngine implements SynonymEngine { private static HashMap String, String[] map = new HashMap String, String[] { map.put("quick", new String[] {"fast","speedy"}); map.put("jumps", new String[] {"leaps","hops"}); map.put("over", new String[] {"above"}); map.put("lazy", new String[] {"apathetic","slugish"}); map.put("dog", new String[] {"canine","pooch"}); } public String[] getSynonyms(String s) throws IOException { return map.get(s); }
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeSource; * 自定义同义词过滤器 * @author Lanxiaowei */ public class SynonymFilter extends TokenFilter { public static final String TOKEN_TYPE_SYNONYM = "SYNONYM"; private Stack String synonymStack; private SynonymEngine engine; private AttributeSource.State current; private final CharTermAttribute termAtt; private final PositionIncrementAttribute posIncrAtt; public SynonymFilter(TokenStream in, SynonymEngine engine) { super(in); synonymStack = new Stack String // #1 this.engine = engine; this.termAtt = addAttribute(CharTermAttribute.class); this.posIncrAtt = addAttribute(PositionIncrementAttribute.class); } public boolean incrementToken() throws IOException { if (synonymStack.size() 0) { // #2 String syn = synonymStack.pop(); // #2 restoreState(current); // #2 // 这里Lucene4.x的写法 // termAtt.setTermBuffer(syn); // 这是Lucene5.x的写法 termAtt.copyBuffer(syn.toCharArray(), 0, syn.length()); posIncrAtt.setPositionIncrement(0); // #3 return true; } if (!input.incrementToken()) // #4 return false; if (addAliasesToStack()) { // #5 current = captureState(); // #6 } return true; // #7 } private boolean addAliasesToStack() throws IOException { // 这里Lucene4.x的写法 // String[] synonyms = engine.getSynonyms(termAtt.term()); //#8 // 这里Lucene5.x的写法 String[] synonyms = engine.getSynonyms(termAtt.toString()); // #8 if (synonyms == null) { return false; } for (String synonym : synonyms) { // #9 synonymStack.push(synonym); } return true; } #1 Define synonym buffer #2 Pop buffered synonyms #3 Set position increment to 0 #4 Read next token #5 Push synonyms onto stack #6 Save current token #7 Return current token #8 Retrieve synonyms #9 Push synonyms onto stack
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Analyzer.TokenStreamComponents; import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import com.yida.framework.lucene5.util.analyzer.codec.MetaphoneReplacementFilter; * 自定义同义词分词器 * @author Lanxiaowei * @createTime 2015-03-31 10:15:23 */ public class SynonymAnalyzer extends Analyzer { private SynonymEngine engine; public SynonymAnalyzer(SynonymEngine engine) { this.engine = engine; } @Override protected TokenStreamComponents createComponents(String text) { Tokenizer tokenizer = new StandardTokenizer(); TokenStream tokenStream = new SynonymFilter(tokenizer, engine); tokenStream = new LowerCaseFilter(tokenStream); tokenStream = new StopFilter(tokenStream,StopAnalyzer.ENGLISH_STOP_WORDS_SET); return new TokenStreamComponents(tokenizer, tokenStream); }
public class SynonymAnalyzerTest { public static void main(String[] args) throws IOException { String text = "The quick brown fox jumps over the lazy dog"; Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine()); AnalyzerUtils.displayTokens(analyzer, text); }
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; * 用于分词器测试的一个简单工具类(用于打印分词情况,包括Term的起始位置和结束位置(即所谓的偏 * 移量),位置增量,Term字符串,Term字符串类型(字符串/阿拉伯数字之类的)) * @author Lanxiaowei */ public class AnalyzerUtils { public static void displayTokens(Analyzer analyzer,String text) throws IOException { TokenStream tokenStream = analyzer.tokenStream("text", text); displayTokens(tokenStream); } public static void displayTokens(TokenStream tokenStream) throws IOException { OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class); tokenStream.reset(); int position = 0; while (tokenStream.incrementToken()) { int increment = positionIncrementAttribute.getPositionIncrement(); if(increment 0) { position = position + increment; System.out.print(position + ":"); } int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); System.out.println("[" + term + "]" + ":(" + startOffset + "-- " + endOffset + "):" + typeAttribute.type()); } } /** * 断言分词结果 * @param analyzer * @param text 源字符串 * @param expecteds 期望分词后结果 * @throws IOException */ public static void assertAnalyzerTo(Analyzer analyzer,String text,String[] expecteds) throws IOException { TokenStream tokenStream = analyzer.tokenStream("text", text); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); for(String 
expected : expecteds) { Assert.assertTrue(tokenStream.incrementToken()); Assert.assertEquals(expected, charTermAttribute.toString()); } Assert.assertFalse(tokenStream.incrementToken()); tokenStream.close(); }
以上代码都是Lucene in Action这本书里面的示例代码,我只不过是基于Lucene5.x把它重写并调试成功了,特此分享,希望对正在学习Lucene5的童鞋们有所帮助。demo代码我会在底下附件里上传,有需要demo源码的请自己在底下的附件里下载,Lucene in Action这本书的随书源码我已上传到我的百度网盘,也一并分享给大家,Lucene in Action随书源码百度网盘下载地址:
千言万语都在代码中,就不多说了,打完收工!
如果你还有什么问题请加我Q-Q:7-3-6-0-3-1-3-0-5,
或者加裙一起交流学习!
转载:http://iamyida.iteye.com/blog/2197355
相关文章
- JBPM学习(一):实现一个简单的工作流例子全过程
- 机器学习如何改变大数据管理
- 简单易学的机器学习算法—基于密度的聚类算法DBSCAN
- RestfulApi 学习笔记——简单介绍(一)
- python标准库学习7
- zeromq学习笔记2——简单的客户端和服务端测试程序
- 简单易学的机器学习算法—基于密度的聚类算法DBSCAN
- 前端学习 -- Css -- 否定伪类
- 前端学习 -- image标签和meta标签
- Deep Learning(深度学习)网络资源
- 【学习总结】win7下安装Ubuntu双系统的日常
- Opencv学习笔记 简单形状检测
- Opencv学习笔记 均方误差(MSE)、结构相似度指数(SSIM)
- Android开发学习笔记(十二)Fragment简单介绍
- 通过最简单的button控件,深入学习SAP UI5框架代码系列之零
- 一个最简单的例子学习SAP Cloud for Customer HTML mashup
- 【python的学习之路】:你的顿悟可能只是别人的基本功,所以捷径,资源真的很重要
- 设计模式学习笔记-简单工厂模式
- 逃逸机器学习的安全检测——evadeML、malGAN、deep-pwning、foolbox、Gym-Malware,防御的话有Defense-GAN: Protecting Classifiers Against Adversarial Attacks Using Generative Models(生成式模型)
- HarmonyOS鸿蒙学习笔记(12)@Link的作用
- HarmonyOS鸿蒙学习笔记(3)@Component注解自定义组件简单说明
- AJAX学习记录(附带对原理的剖析及简单的封装使用)
- MOSFET 和 IGBT 栅极驱动器电路的基本原理学习笔记(四)高侧非隔离栅极驱动
- AXI 总线协议学习笔记(1)
- 学习计算机视觉需要哪些数学基础?
- 学习经验分享【24】全网最简单标注数据集方法