您现在的位置是：首页 > 工具

当前栏目

Lucene5学习之自定义同义词分词器简单示例

学习简单自定义示例分词器同义词

2023-09-14 08:59:37 时间

同义词功能在全文搜索时的意义，大家应该都懂的。今天中文我就试着写了一个同义词分词的示例demo，其实主要代码还是参考《Lucene in Action》这本英文版书籍的随书代码，只不过《Lucene in Action》书里的示例代码目前最新版只支持到Lucene4.x，对于Lucene5.x，代码需要稍作修改，下面是基于Lucene5.x的自定义同义词分词器demo：

import java.io.IOException;

/**
 * Supplies the synonyms known for a single term.
 */
public interface SynonymEngine {

    /**
     * Looks up the synonyms of the given term.
     *
     * @param s the term to look up
     * @return the synonyms of {@code s}; may be {@code null} when the
     *         implementation knows none
     * @throws IOException if the lookup fails
     */
    String[] getSynonyms(String s) throws IOException;
}
public class BaseSynonymEngine implements SynonymEngine {       private static HashMap String, String[] map = new HashMap String, String[]               {           map.put("quick", new String[] {"fast","speedy"});           map.put("jumps", new String[] {"leaps","hops"});           map.put("over", new String[] {"above"});           map.put("lazy", new String[] {"apathetic","slugish"});           map.put("dog", new String[] {"canine","pooch"});       }       public String[] getSynonyms(String s) throws IOException {           return map.get(s);       }
import org.apache.lucene.analysis.TokenFilter;   import org.apache.lucene.analysis.TokenStream;   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;   import org.apache.lucene.util.AttributeSource;   * 自定义同义词过滤器 * @author Lanxiaowei */   public class SynonymFilter extends TokenFilter {       public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";       private Stack String synonymStack;       private SynonymEngine engine;       private AttributeSource.State current;       private final CharTermAttribute termAtt;       private final PositionIncrementAttribute posIncrAtt;       public SynonymFilter(TokenStream in, SynonymEngine engine) {           super(in);           synonymStack = new Stack String // #1           this.engine = engine;           this.termAtt = addAttribute(CharTermAttribute.class);           this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);       }       public boolean incrementToken() throws IOException {           if (synonymStack.size() 0) { // #2               String syn = synonymStack.pop(); // #2               restoreState(current); // #2               // 这里Lucene4.x的写法               // termAtt.setTermBuffer(syn);               // 这是Lucene5.x的写法               termAtt.copyBuffer(syn.toCharArray(), 0, syn.length());               posIncrAtt.setPositionIncrement(0); // #3               return true;           }           if (!input.incrementToken()) // #4               return false;           if (addAliasesToStack()) { // #5               current = captureState(); // #6           }           return true; // #7       }       private boolean addAliasesToStack() throws IOException {           // 这里Lucene4.x的写法           // String[] synonyms = engine.getSynonyms(termAtt.term()); //#8           // 这里Lucene5.x的写法           String[] synonyms = engine.getSynonyms(termAtt.toString()); // #8           if (synonyms == null) {               
return false;           }           for (String synonym : synonyms) { // #9               synonymStack.push(synonym);           }           return true;       }   #1 Define synonym buffer #2 Pop buffered synonyms #3 Set position increment to 0 #4 Read next token #5 Push synonyms onto stack #6 Save current token #7 Return current token #8 Retrieve synonyms #9 Push synonyms onto stack
import org.apache.lucene.analysis.Analyzer;   import org.apache.lucene.analysis.TokenStream;   import org.apache.lucene.analysis.Tokenizer;   import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;   import org.apache.lucene.analysis.core.LetterTokenizer;   import org.apache.lucene.analysis.core.LowerCaseFilter;   import org.apache.lucene.analysis.core.StopAnalyzer;   import org.apache.lucene.analysis.core.StopFilter;   import org.apache.lucene.analysis.standard.StandardFilter;   import org.apache.lucene.analysis.standard.StandardTokenizer;   import com.yida.framework.lucene5.util.analyzer.codec.MetaphoneReplacementFilter;   * 自定义同义词分词器 * @author Lanxiaowei * @createTime 2015-03-31 10:15:23 */   public class SynonymAnalyzer extends Analyzer {       private SynonymEngine engine;       public SynonymAnalyzer(SynonymEngine engine) {           this.engine = engine;       }       @Override       protected TokenStreamComponents createComponents(String text) {           Tokenizer tokenizer = new StandardTokenizer();           TokenStream tokenStream = new SynonymFilter(tokenizer, engine);           tokenStream = new LowerCaseFilter(tokenStream);           tokenStream = new StopFilter(tokenStream,StopAnalyzer.ENGLISH_STOP_WORDS_SET);           return new TokenStreamComponents(tokenizer, tokenStream);       }
public class SynonymAnalyzerTest {       public static void main(String[] args) throws IOException {           String text = "The quick brown fox jumps over the lazy dog";           Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());           AnalyzerUtils.displayTokens(analyzer, text);       }
import org.apache.lucene.analysis.Analyzer;   import org.apache.lucene.analysis.TokenStream;   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;   import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;   import org.apache.lucene.analysis.tokenattributes.TypeAttribute;   * 用于分词器测试的一个简单工具类(用于打印分词情况，包括Term的起始位置和结束位置(即所谓的偏 * 移量)，位置增量，Term字符串，Term字符串类型(字符串/阿拉伯数字之类的)) * @author Lanxiaowei */   public class AnalyzerUtils {       public static void displayTokens(Analyzer analyzer,String text) throws IOException {           TokenStream tokenStream = analyzer.tokenStream("text", text);           displayTokens(tokenStream);       }              public static void displayTokens(TokenStream tokenStream) throws IOException {           OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);           PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);           CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);           TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);                      tokenStream.reset();           int position = 0;           while (tokenStream.incrementToken()) {               int increment = positionIncrementAttribute.getPositionIncrement();               if(increment 0) {                   position = position + increment;                   System.out.print(position + ":");               }               int startOffset = offsetAttribute.startOffset();               int endOffset = offsetAttribute.endOffset();               String term = charTermAttribute.toString();               System.out.println("[" + term + "]" + ":(" + startOffset + "-- " + endOffset + "):" + typeAttribute.type());           }       }              /**      * 断言分词结果      * @param analyzer      * @param text        源字符串     
 * @param expecteds   期望分词后结果      * @throws IOException        */       public static void assertAnalyzerTo(Analyzer analyzer,String text,String[] expecteds) throws IOException {           TokenStream tokenStream = analyzer.tokenStream("text", text);           CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);           for(String expected : expecteds) {               Assert.assertTrue(tokenStream.incrementToken());               Assert.assertEquals(expected, charTermAttribute.toString());           }           Assert.assertFalse(tokenStream.incrementToken());           tokenStream.close();       }

以上代码都是Lucene in Action这本书里面的示例代码，我只不过是基于Lucene5.x把它重写并调试成功了，特此分享，希望对正在学习Lucene5的童鞋们有所帮助。demo代码我会在底下附件里上传，有需要demo源码的请自己在底下的附件里下载，Lucene in Action这本书的随书源码我已上传到我的百度网盘，也一并分享给大家，Lucene in Action随书源码百度网盘下载地址：

戳我，戳我，快戳我！！！Come on.

千言万语都在代码中，就不多说了，打完收工！

如果你还有什么问题请加我Ｑ-Q：7-3-6-0-3-1-3-0-5，

或者加裙
一起交流学习！

转载：http://iamyida.iteye.com/blog/2197355

猜你喜欢

SQL sqlserver order by 1,order by 后面直接加数字，多个字段排序
2022年除夕不是年三十你发现了吗？除夕倒计时在手机便签上设置
815. 打印字符串
Python——面向对象编程（十一）
虚假新闻检测（CANMD）《Contrastive Domain Adaptation for Early Misinformation Detection: A Case Study on COVID-19》
nginx 前端调度对后端的app的生存状态的检测
NS_ASSUME_NONNULL_BEGIN NS_ASSUME_NONNULL_END
ansible使用setup模块查看受控机的信息(ansible2.9.5)
怎样让Echarts默认显示全部的数据标签
Android 开发之旅：深入分析布局文件&又是“Hello World！”
CRM客户主数据UI上有哪些字段可以触发partner determination
第二百一十二节，jQuery EasyUI，Combo(自定义下拉框)组件
如何入门 Python 爬虫？详细教程在这里
智能客服搭建（0）——写在前面
LabVIEW窗口保持在最前端
【Ansible自动化运维工具】ansible的角色基本使用
【关于ChatGPT的30个问题】20、ChatGPT是否会被用于恶意目的？/ By 禅与计算机程序设计艺术
【C/C++学院】0828-数组与指针/内存分配/数据结构数组接口与封装
计算机的指令系统
COM笔记-包容与聚合
OVS 总体架构、源码结构及数据流程全面解析

相关主题

python学习笔记
LinQ的学习(一)
大学生学习
学习学习
react学习
[机器学习] 集成学习
机器学习之线性回归
机器学习和统计学习
机器学习之深度学习
学习学习中
知识学习笔记
安卓学习笔记2
java 学习连接

zl程序教程

当前栏目

Lucene5学习之自定义同义词分词器简单示例

相关文章