您现在的位置是：首页 > 后端

当前栏目

LDA主题模型的java代码实现详解大数据

JAVA 数据代码实现详解模型主题 LDA

2023-06-13 09:20:26 时间

/**Get parameters from configuring file. If the * configuring file has value in it, use the value. * Else the default value in program will be used * @param ldaparameters * @param parameterFile * @return void private static void getParametersFromFile(modelparameters ldaparameters, String parameterFile) { // TODO Auto-generated method stub ArrayList String paramLines = new ArrayList String paramLines = FileUtil.readList(parameterFile); for(String line : paramLines){ String[] lineParts = line.split("/t"); switch(parameters.valueOf(lineParts[0])){ case alpha: ldaparameters.alpha = Float.valueOf(lineParts[1]); break; case beta: ldaparameters.beta = Float.valueOf(lineParts[1]); break; case topicNum: ldaparameters.topicNum = Integer.valueOf(lineParts[1]); break; case iteration: ldaparameters.iteration = Integer.valueOf(lineParts[1]); break; case saveStep: ldaparameters.saveStep = Integer.valueOf(lineParts[1]); break; case beginSaveIters: ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]); break; public enum parameters{ alpha, beta, topicNum, iteration, saveStep, beginSaveIters; /** * 训练LDA主题模型，对给定的测试样本集进行主题预测，找出每个样本的最大概率主题下的前20个词的集合，作为该测试样本集的主题代表关键词集合 * @param trainPathDir * @param parameterFile * @param resultPath * @param testPath * @return * @throws IOException public Set Word trainAndPredictLDA(String trainPathDir,String parameterFile,String resultPath,String testPath) throws IOException{ modelparameters ldaparameters = new modelparameters(); getParametersFromFile(ldaparameters, parameterFile); Documents docSet = new Documents(); docSet.readDocs(trainPathDir); System.out.println("wordMap size " + docSet.termToIndexMap.size()); FileUtil.mkdir(resultPath); LdaModel model = new LdaModel(ldaparameters); System.out.println("1 Initialize the model ..."); model.initializeModel(docSet); System.out.println("2 Learning and Saving the model ..."); model.inferenceModel(docSet); System.out.println("3 Output the final model ..."); // 
model.saveIteratedModel(ldaparameters.iteration, docSet); // System.out.println("Done!"); //预测新文本 Documents testDocs = new Documents(); List Message messages = FileUtil.readMessageFromFile(testPath); Set Integer topicIndexSet = new HashSet Integer for(Message message : messages){ String content = message.getContent(); Document doc = new Document(content); testDocs.docs.add(doc); topicIndexSet.add(model.predictNewSampleTopic(doc)); /** * 预测每条短信，得到每条的最大概率主题，最后找到每个最大概率主题的前20个词，集合,计算tf-idf Set Word wordSet = model.getWordByTopics(topicIndexSet, 20); LDAFeatureProcess.calTFIDFAsWeight(docSet, wordSet); return wordSet; @Test public void test() throws IOException{ String resultPath = "ldaResult/"; String parameterFile= "source/lda_parameters.txt"; String trainPathDir = "LDATrain/"; String testPath = "train/train_messages.txt"; Set Word wordSet = trainAndPredictLDA(trainPathDir,parameterFile,resultPath,testPath); FileUtil.writeKeyWordFile("ldaWords/keyWords.doc", new ArrayList Word (wordSet));
public static void main(String[] args) throws IOException { // TODO Auto-generated method stub String resultPath = "ldaResult/"; String parameterFile= "source/lda_parameters.txt"; modelparameters ldaparameters = new modelparameters(); getParametersFromFile(ldaparameters, parameterFile); String dirPath = "LDATrain/"; Documents docSet = new Documents(); docSet.readDocs(dirPath); System.out.println("wordMap size " + docSet.termToIndexMap.size()); FileUtil.mkdir(resultPath); LdaModel model = new LdaModel(ldaparameters); System.out.println("1 Initialize the model ..."); model.initializeModel(docSet); System.out.println("2 Learning and Saving the model ..."); model.inferenceModel(docSet); System.out.println("3 Output the final model ..."); model.saveIteratedModel(ldaparameters.iteration, docSet); System.out.println("Done!"); //预测新文本 String messStr = "好消息！！薇町婚纱造型推出老带新活动啦！已在本店预定的新娘推荐新顾客来本店，定单后即赠送新、老顾客各一支价值58元定妆隔离水（在婚礼当"; Document doc = new Document(messStr); int topicIndex = model.predictNewSampleTopic(doc); Set Word wordSet = model.getWordByTopic(topicIndex); FileUtil.writeKeyWordFile("ldaWords/comparedkeyWords.doc", new ArrayList Word (wordSet));

public class LdaModel { 

 int [][] doc;//word index array 

 int V, K, M;//vocabulary size, topic number, document number 

 int [][] z;//topic label array 

 float alpha; //doc-topic dirichlet prior parameter 

 float beta; //topic-word dirichlet prior parameter 

 int [][] nmk;//given document m, count times of topic k. M*K 

 int [][] nkt;//given topic k, count times of term t. K*V 

 int [] nmkSum;//Sum for each row in nmk 

 int [] nktSum;//Sum for each row in nkt 

 double [][] phi;//Parameters for topic-word distribution K*V 

 double [][] theta;//Parameters for doc-topic distribution M*K 

 int iterations;//Times of iterations 

 int saveStep;//The number of iterations between two saving 

 int beginSaveIters;//Begin save model at this iteration 

 Map String, Integer wordIndexMap; 

 Documents docSet; 

 /**
  * Creates a model configured from the given hyper-parameters. Only the scalar
  * settings are copied here; the count matrices are allocated later by
  * {@code initializeModel}.
  *
  * @param modelparam source of alpha, beta, iteration count, topic count and save schedule
  */
 public LdaModel(LdaGibbsSampling.modelparameters modelparam) {
     alpha = modelparam.alpha;
     beta = modelparam.beta;
     iterations = modelparam.iteration;
     K = modelparam.topicNum;
     saveStep = modelparam.saveStep;
     beginSaveIters = modelparam.beginSaveIters;
 }

 public void initializeModel(Documents docSet) { 

 this.docSet = docSet; 

 // TODO Auto-generated method stub 

 M = docSet.docs.size(); 

 V = docSet.termToIndexMap.size(); 

 nmk = new int [M][K]; 

 nkt = new int[K][V]; 

 nmkSum = new int[M]; 

 nktSum = new int[K]; 

 phi = new double[K][V]; 

 theta = new double[M][K]; 

 this.wordIndexMap = new HashMap String, Integer 

 //initialize documents index array 

 doc = new int[M][]; 

 for(int m = 0; m m++){ 

 //Notice the limit of memory 

 int N = docSet.docs.get(m).docWords.length; 

 doc[m] = new int[N]; 

 for(int n = 0; n n++){ 

 doc[m][n] = docSet.docs.get(m).docWords[n]; 

 //initialize topic lable z for each word 

 z = new int[M][]; 

 for(int m = 0; m m++){ 

 int N = docSet.docs.get(m).docWords.length; 

 z[m] = new int[N]; 

 for(int n = 0; n n++){ 

 //随机初始化！ 

 int initTopic = (int)(Math.random() * K);// From 0 to K - 1 

 z[m][n] = initTopic; 

 //number of words in doc m assigned to topic initTopic add 1 

 nmk[m][initTopic]++; 

 //number of terms doc[m][n] assigned to topic initTopic add 1 

 nkt[initTopic][doc[m][n]]++; 

 // total number of words assigned to topic initTopic add 1 

 nktSum[initTopic]++; 

 // total number of words in document m is N 

 nmkSum[m] = N; 

 public void inferenceModel(Documents docSet) throws IOException { 

 // TODO Auto-generated method stub 

 if(iterations saveStep + beginSaveIters){ 

 System.err.println("Error: the number of iterations should be larger than " + (saveStep + beginSaveIters)); 

 System.exit(0); 

 for(int i = 0; i iterations; i++){ 

 System.out.println("Iteration " + i); 

 if((i = beginSaveIters) (((i - beginSaveIters) % saveStep) == 0)){ 

 //Saving the model 

 System.out.println("Saving model at iteration " + i +" ... "); 

 //Firstly update parameters 

 updateEstimatedParameters(); 

 //Secondly print model variables 

 saveIteratedModel(i, docSet); 

 //Use Gibbs Sampling to update z[][] 

 for(int m = 0; m m++){ 

 int N = docSet.docs.get(m).docWords.length; 

 for(int n = 0; n n++){ 

 // Sample from p(z_i|z_-i, w) 

 int newTopic = sampleTopicZ(m, n); 

 z[m][n] = newTopic; 

 private void updateEstimatedParameters() { 

 // TODO Auto-generated method stub 

 for(int k = 0; k k++){ 

 for(int t = 0; t t++){ 

 phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta); 

 for(int m = 0; m m++){ 

 for(int k = 0; k k++){ 

 theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha); 

 private int sampleTopicZ(int m, int n) { 

 // TODO Auto-generated method stub 

 // Sample from p(z_i|z_-i, w) using Gibbs upde rule 

 //Remove topic label for w_{m,n} 

 int oldTopic = z[m][n]; 

 nmk[m][oldTopic]--; 

 nkt[oldTopic][doc[m][n]]--; 

 nmkSum[m]--; 

 nktSum[oldTopic]--; 

 //Compute p(z_i = k|z_-i, w) 

 double [] p = new double[K]; 

 for(int k = 0; k k++){ 

 p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha); 

 //Sample a new topic label for w_{m, n} like roulette 

 //Compute cumulated probability for p 

 for(int k = 1; k k++){ 

 p[k] += p[k - 1]; 

 double u = Math.random() * p[K - 1]; //p[] is unnormalised 

 int newTopic; 

 for(newTopic = 0; newTopic newTopic++){ 

 if(u p[newTopic]){ 

 break; 

 //Add new topic label for w_{m, n} 

 nmk[m][newTopic]++; 

 nkt[newTopic][doc[m][n]]++; 

 nmkSum[m]++; 

 nktSum[newTopic]++; 

 return newTopic; 

 /** 

 * 对给定的待预测的文本，将其分词结果的单词与训练集的单词的索引对应上 

 * @param predictWordSet 

 * @return 

 public Map String,String matchTermIndex(Set Word predictWordSet){ 

 /** 

 * key:word的内容 value：文档index-单词index，如“1-2” 

 Map String,String wordIndexMap = new HashMap String, String 

 for(Word word : predictWordSet){ 

 String content = word.getContent(); 

 String indexStr = getTermIndex(content); 

 wordIndexMap.put(content, indexStr); 

 return wordIndexMap; 

 /** 

 * 对于给定单词，找到该单词在训练集中对应的文档和单词索引 

 * @param content 

 * @return 

 public String getTermIndex(String content){ 

 for(Integer m : docSet.getDocWordsList().keySet()){ 

 LinkedList String list = docSet.getDocWordsList().get(m); 

 for(int i = 0; i list.size(); i ++){ 

 if(list.get(i).equals(content)) 

 return m+"-"+i; 

 return "none"; 

 /** 

 * 在训练完LDA模型后，根据给定的主题索引set，得到每个主题的topNum单词列表集合 

 * @param topicIndexSet 

 * @param topNum 

 * @return 

 public Set Word getWordByTopics(Set Integer topicIndexSet, int topNum){ 

 Set Word wordSet = new HashSet Word 

 for(Integer indexT : topicIndexSet){ 

 List Integer tWordsIndexArray = new ArrayList Integer 

 for(int j = 0; j j++) 

 tWordsIndexArray.add(new Integer(j)); 

 Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[indexT])); 

 for(int t = 0; t topNum; t++){ 

 String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t)); 

 Word word = new Word(content); 

 if(SegmentWordsResult.getStopWordsSet().contains(content)|| 

 ProcessKeyWords.remove(word) || ProcessKeyWords.isMeaninglessWord(content)) 

 continue; 

 wordSet.add(word); 

 return wordSet; 

 public Set Word getWordByTopic(Integer topicIndex){ 

 Set Word wordSet = new HashSet Word 

 List Integer tWordsIndexArray = new ArrayList Integer 

 for(int j = 0; j j++){ 

 tWordsIndexArray.add(new Integer(j)); 

 Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[topicIndex])); 

 for(int t = 0; t t++){ 

 String content = docSet.indexToTermMap.get(tWordsIndexArray.get(t)); 

 Word word = new Word(content); 

 word.setWeight(phi[topicIndex][tWordsIndexArray.get(t)]); 

 if(SegmentWordsResult.getStopWordsSet().contains(content)|| 

 ProcessKeyWords.remove(word) || ProcessKeyWords.isMeaninglessWord(content)) 

 continue; 

 if(phi[topicIndex][tWordsIndexArray.get(t)] = 0.0) 

 continue; 

 wordSet.add(word); 

 return wordSet; 


 double topicProb[] = new double[K]; 

 Map String,String wordIndexMap = matchTermIndex(doc.getWordMap().keySet()); 

 int predict_v = doc.getWordCount(); 

 int [][] predict_nkt;//given topic k, count times of term t. K*V 

 double [][] predict_phi;//Parameters for topic-word distribution K*V 

 int [] predict_z;//topic label array 

 int [] predict_nk;//该文档覆盖的主题索引，值为该文档覆盖指定主题的次数 

 predict_nkt = new int[K][predict_v]; 

 predict_phi = new double[K][predict_v]; 

 predict_z = new int[predict_v]; 

 predict_nk = new int[K]; 

 for(int index = 0; index predict_v; index++){ 

 String content = doc.getWordsList().get(index); 

 String indexStr = wordIndexMap.get(content); 

 if(indexStr.indexOf("-") == -1) 

 continue; 

 int m = Integer.valueOf(indexStr.substring(0, indexStr.indexOf("-"))); 

 int n = Integer.valueOf(indexStr.substring(indexStr.indexOf("-")+1)); 

 // Sample from p(z_i|z_-i, w) 

 int newTopic = predictSampleTopicZ(m, n); 

 predict_z[index] = newTopic; 

 predict_nkt[newTopic][index] ++; 

 predict_nk[newTopic] ++; 

 for(int k = 0; k k++){ 

 topicProb[k] = (predict_nk[k] + alpha) / (predict_v + K * alpha); 

 return getTopic(topicProb); 

 public int getTopic(double[] topicProp){ 

 int maxIndex = 0; 

 double maxProp = topicProp[0]; 

 Set String words = new HashSet String 

 for(int k = 1; k k ++){ 

 if(maxProp topicProp[k]){ 

 maxProp = topicProp[k]; 

 maxIndex = k; 

 return maxIndex; 

 public int predictSampleTopicZ(int m, int n){ 

 // TODO Auto-generated method stub 

 // Sample from p(z_i|z_-i, w) using Gibbs upde rule 

 //Compute p(z_i = k|z_-i, w) 

 double [] p = new double[K]; 

 for(int k = 0; k k++){ 

 p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha); 

 //Sample a new topic label for w_{m, n} like roulette 

 //Compute cumulated probability for p 

 for(int k = 1; k k++){ 

 p[k] += p[k - 1]; 

 double u = Math.random() * p[K - 1]; //p[] is unnormalised 

 int newTopic; 

 for(newTopic = 0; newTopic newTopic++){ 

 if(u p[newTopic]){ 

 break; 

 //Add new topic label for w_{m, n} 

 return newTopic; 

 public void saveIteratedModel(int iters, Documents docSet) throws IOException { 

 // TODO Auto-generated method stub 

 //lda.params lda.phi lda.theta lda.tassign lda.twords 

 //lda.params 

 String resultPath = "ldaResult/"; 

 String modelName = "lda_" + iters; 

 ArrayList String lines = new ArrayList String 

 lines.add("alpha = " + alpha); 

 lines.add("beta = " + beta); 

 lines.add("topicNum = " + K); 

 lines.add("docNum = " + M); 

 lines.add("termNum = " + V); 

 lines.add("iterations = " + iterations); 

 lines.add("saveStep = " + saveStep); 

 lines.add("beginSaveIters = " + beginSaveIters); 

 FileUtil.writeLines(resultPath + modelName + ".params", lines); 

 //lda.phi K*V 

 BufferedWriter writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".phi")); 

 for (int i = 0; i i++){ 

 for (int j = 0; j j++){ 

 writer.write(phi[i][j] + "/t"); 

 writer.write("/n"); 

 writer.close(); 

 //lda.theta M*K 

 writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".theta")); 

 for(int i = 0; i i++){ 

 for(int j = 0; j j++){ 

 writer.write(theta[i][j] + "/t"); 

 writer.write("/n"); 

 writer.close(); 

 //lda.tassign 

 writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".tassign")); 

 for(int m = 0; m m++){ 

 for(int n = 0; n doc[m].length; n++){ 

 writer.write(doc[m][n] + ":" + z[m][n] + "/t"); 

 writer.write("/n"); 

 writer.close(); 

 List Word appendwords = new ArrayList Word 

 //lda.twords phi[][] K*V 

 writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".twords")); 

 int topNum = 10; //Find the top 20 topic words in each topic 

 for(int i = 0; i i++){ 

 List Integer tWordsIndexArray = new ArrayList Integer 

 for(int j = 0; j j++){ 

 tWordsIndexArray.add(new Integer(j)); 

 Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[i])); 

 writer.write("topic " + i + "/t:/t"); 

 for(int t = 0; t topNum; t++){ 

 writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + " " + phi[i][tWordsIndexArray.get(t)] + "/t"); 

 Word word = new Word(docSet.indexToTermMap.get(tWordsIndexArray.get(t))); 

 word.setWeight(phi[i][tWordsIndexArray.get(t)]); 

 appendwords.add(word); 

 writer.write("/n"); 

 writer.close(); 

 //lda.words 

 writer = new BufferedWriter(new FileWriter(resultPath + modelName + ".words")); 

 for(Word word : appendwords){ 

 if(word.getContent().trim().equals("")) 

 continue; 

 writer.write(word.getContent()+"/t"+word.getWeight()+"/n"); 

 writer.close(); 

 /**
  * Orders term indices by descending probability, so that sorting a list of indices
  * puts the most probable term of a topic first.
  */
 public class TwordsComparable implements Comparator<Integer> {

     /** Probability of each term under the topic being sorted, indexed by term index. */
     public double[] sortProb;

     /**
      * @param sortProb probability of every term under one topic
      */
     public TwordsComparable(double[] sortProb) {
         this.sortProb = sortProb;
     }

     /**
      * @return a negative value when {@code o1} is more probable than {@code o2},
      *         a positive value when it is less probable, 0 when both are equal
      */
     @Override
     public int compare(Integer o1, Integer o2) {
         if (sortProb[o1] > sortProb[o2]) {
             return -1;
         } else if (sortProb[o1] < sortProb[o2]) {
             return 1;
         } else {
             return 0;
         }
     }
 }

 public static void main(String[] args){ 

}

public class Documents { 

ArrayList Document docs; 

 Map String, Integer termToIndexMap; 

 ArrayList String indexToTermMap; 

 Map String,Integer termCountMap; 

 private static NLPIRUtil npr = new NLPIRUtil(); 

 private static Set String stopWordsSet = SegmentWordsResult.getStopWordsSet(); 

 private Map Word,Integer wordDocMap; 

 private Map Integer, LinkedList String docWordsList;//key:第i篇文档，value：单词列表，为了与lda模型中的doc[m][n]的索引对应 


 /**
  * Creates an empty corpus with all collections initialised.
  *
  * NOTE(review): the constructor header and the initialisation of {@code docs},
  * {@code termToIndexMap} and {@code indexToTermMap} were lost in the published
  * listing; they are rebuilt from the field declarations and from {@code new Documents()}
  * call sites -- confirm against the real source.
  */
 public Documents() {
     docs = new ArrayList<Document>();
     termToIndexMap = new HashMap<String, Integer>();
     indexToTermMap = new ArrayList<String>();
     termCountMap = new HashMap<String, Integer>();
     this.wordDocMap = new HashMap<Word, Integer>();
     this.docWordsList = new HashMap<Integer, LinkedList<String>>();
 }

public Map String, Integer getTermCountMap() { 

return termCountMap; 


public void setTermCountMap(Map String, Integer termCountMap) { 

this.termCountMap = termCountMap; 

 public Map Word, Integer getWordDocMap() { 

return wordDocMap; 


public void setWordDocMap(Map Word, Integer wordDocMap) { 

this.wordDocMap = wordDocMap; 


public void setDocWordsList(Map Integer, LinkedList String docWordsList) { 

this.docWordsList = docWordsList; 


 for(File docFile : new File(docsPath).listFiles()){ 

 Document doc = new Document(docFile.getAbsolutePath(), termToIndexMap, indexToTermMap, termCountMap); 

 docs.add(doc); 

 for(Word word : doc.getWordMap().keySet()){ 

 if(this.wordDocMap.containsKey(word)) 

 this.wordDocMap.put(word, this.wordDocMap.get(word)); 

 else 

 this.wordDocMap.put(word, 1); 

 this.docWordsList.put(index++, doc.getWordsList()); 


 private static NLPIRUtil npr = new NLPIRUtil(); 

 private static Set String stopWordsSet = SegmentWordsResult.getStopWordsSet(); 

 private String docName; 

 int[] docWords; 

 private int wordCount; 

 private Map Word, Integer wordMap ; 

 private LinkedList String wordsList;//为了和docWords的索引对应，即单词内容对应索引值 

 public int getWordCount() { 

 return wordCount; 

 public void setWordCount(int wordCount) { 

 this.wordCount = wordCount; 

 public Map Word, Integer getWordMap() { 

 return wordMap; 

 public void setWordMap(Map Word, Integer wordMap) { 

 this.wordMap = wordMap; 

 public LinkedList String getWordsList() { 

 return wordsList; 

 public void setWordsList(LinkedList String wordsList) { 

 this.wordsList = wordsList; 

 /**
  * Builds a stand-alone document from raw text: segments it, drops filtered tokens,
  * and records the kept words, their counts and document-local term indices.
  *
  * @param docContent raw text of the document
  */
 public Document(String docContent) {
     this.wordMap = new HashMap<Word, Integer>();
     this.wordsList = new LinkedList<String>();
     String splitResult = npr.NLPIR_ParagraphProcess(ProcessMessage.dealWithSentence(docContent), 0);
     String[] wordsArray = splitResult.split(" ");
     this.docWords = new int[wordsArray.length];
     // Document-local dictionary: word content -> index of its first occurrence.
     // wordMap cannot serve here: it is keyed by Word and holds counts, not indices.
     Map<String, Integer> localIndex = new HashMap<String, Integer>();
     int index = 0;
     for (String str : wordsArray) {
         String content = ProcessMessage.dealSpecialString(str);
         Word word = new Word(content);
         if (ProcessKeyWords.remove(word) || stopWordsSet.contains(content)) {
             continue;
         }
         if (content.length() <= 1 || RegexMatch.specialMatch(content)) {
             continue;
         }
         this.wordCount++;

         Integer termIndex = localIndex.get(content);
         if (termIndex == null) {
             termIndex = localIndex.size();
             localIndex.put(content, termIndex);
         }
         docWords[index++] = termIndex;

         // assumes Word implements equals/hashCode on its content -- TODO confirm
         Integer seen = wordMap.get(word);
         wordMap.put(word, seen == null ? 1 : seen + 1);
         this.wordsList.add(content);
     }
     // Drop the unused tail: skipped tokens would otherwise survive as phantom term 0.
     int[] kept = new int[index];
     System.arraycopy(docWords, 0, kept, 0, index);
     this.docWords = kept;
 }

 /**
  * Builds a training document from a file, registering every kept word in the shared
  * corpus dictionaries.
  *
  * The earlier delegation to {@code Document(String)} is removed: everything it
  * produced was overwritten below except {@code wordCount}, which ended up counted twice.
  *
  * @param filePath path of the document file
  * @param termToIndexMap shared dictionary term -> index, extended in place
  * @param indexToTermMap shared reverse dictionary index -> term, extended in place
  * @param termCountMap shared corpus-wide term counts, updated in place
  */
 public Document(String filePath, Map<String, Integer> termToIndexMap, ArrayList<String> indexToTermMap, Map<String, Integer> termCountMap) {
     this.docName = filePath;
     this.wordMap = new HashMap<Word, Integer>();
     this.wordsList = new LinkedList<String>();

     // Read the file and segment it into tokens.
     String docContent = FileUtil.readContent(docName);
     String splitResult = npr.NLPIR_ParagraphProcess(docContent, 0);
     String[] wordsArray = splitResult.split(" ");
     this.docWords = new int[wordsArray.length];
     int index = 0;

     // Translate every kept word into its dictionary index.
     for (String str : wordsArray) {
         String content = ProcessMessage.dealSpecialString(str);
         Word word = new Word(content);
         if (ProcessKeyWords.remove(word) || stopWordsSet.contains(content)) {
             continue;
         }
         if (ProcessKeyWords.isMeaninglessWord(content)) {
             continue;
         }
         this.wordCount++;

         // Key all dictionaries by the cleaned content; mixing in the raw token made
         // lookups miss and the same word be registered more than once.
         if (!termToIndexMap.containsKey(content)) {
             int newIndex = termToIndexMap.size();
             termToIndexMap.put(content, newIndex);
             indexToTermMap.add(content);
             termCountMap.put(content, 1);
             docWords[index++] = newIndex;
         } else {
             termCountMap.put(content, termCountMap.get(content) + 1);
             docWords[index++] = termToIndexMap.get(content);
         }
         this.wordsList.add(content);

         if (wordMap.containsKey(word)) {
             wordMap.put(word, wordMap.get(word) + 1);
         } else {
             wordMap.put(word, 1);
         }
     }
     // Drop the unused tail: skipped tokens would otherwise survive as phantom term 0.
     int[] kept = new int[index];
     System.arraycopy(docWords, 0, kept, 0, index);
     this.docWords = kept;
 }

 public boolean isNoiseWord(String string) { 

 // TODO Auto-generated method stub 

 string = string.toLowerCase().trim(); 

 Pattern MY_PATTERN = Pattern.compile(".*[a-zA-Z]+.*"); 

 Matcher m = MY_PATTERN.matcher(string); 

 // filter @xxx and URL 

 if(string.matches(".*www//..*") || string.matches(".*//.com.*") || 

 string.matches(".*http:.*") ) 

 return true; 

 else 

 return false; 

}

上述LdaModel类中包含了预测新样本的方法predictNewSampleTopic，该方法返回样本的最大概率主题索引；LdaGibbsSampling类中则是训练LDA主题模型的完整流程。

主题-单词分布的部分结果如下：

topic 0 : ⒐ 0.0029859442729502916 住宅 0.002257665153592825 制造 0.002257665153592825 行为 0.002257665153592825 收益 0.0015293860342353582 西北 0.0015293860342353582 红星 0.0015293860342353582 轻松 0.0015293860342353582 小商品 0.0015293860342353582 搜房网 0.0015293860342353582

topic 1
:
贵宾 0.0030435749795287848
商城 0.0023012396413832903
太平洋保险 0.0015589043032377958
建设 0.0015589043032377958
储蓄 0.0015589043032377958
周四 0.0015589043032377958
完成 0.0015589043032377958
区内 0.0015589043032377958
王志钢 0.0015589043032377958
872944 0.0015589043032377958

topic 2
:
油田 0.0017282527405768633
雀巢 0.0017282527405768633
金千 0.0017282527405768633
山腰 9.052753448486328E-4

代办 9.052753448486328E-4
洋房 9.052753448486328E-4
月饼 9.052753448486328E-4
三星 9.052753448486328E-4
集成 9.052753448486328E-4
大桥 9.052753448486328E-4

topic 3
:
美容 0.0016053818399086595
疯狂 0.0016053818399086595
获取 0.0016053818399086595
名牌 0.0016053818399086595
风神 0.0016053818399086595
小额 0.0016053818399086595
璀璨 0.0016053818399086595
一千 0.0016053818399086595
专注 0.0016053818399086595
发放 0.0016053818399086595

topic 4
:
焦点 0.002957939635962248
搜狐 0.002236490836367011

房屋 0.002236490836367011
玉兰 0.002236490836367011
短期 0.002236490836367011
理疗 0.002236490836367011
4001080000 0.0015150421531870961
命题 0.0015150421531870961
公开 0.0015150421531870961
乐器 0.0015150421531870961

topic 5
:
实验 0.0023698494769632816
每块 0.0023698494769632816
收费 0.0023698494769632816
博览 0.0016053818399086595
重新 0.0016053818399086595
任意 0.0016053818399086595
借款 0.0016053818399086595
保底 0.0016053818399086595
预期 0.0016053818399086595
初二 0.0016053818399086595

topic 6
:
宗旨 0.0016625761054456234
陈勇军 0.0016625761054456234
拨打 0.0016625761054456234
家人 0.0016625761054456234
工业 0.0016625761054456234
百货店 0.0016625761054456234
实业 0.0016625761054456234
6222024000068818521 0.0016625761054456234
18692297994 0.0016625761054456234
13300 0.0016625761054456234

topic 7
:
→ 0.005167018622159958
餐厅 0.00298377126455307
保修 0.00298377126455307
英语 0.0022560220677405596

红 0.0022560220677405596
普通 0.0022560220677405596
学习 0.001528272987343371
龙湖 0.001528272987343371
电大 0.001528272987343371
任意 0.001528272987343371

topic 8
:
登陆 0.0025078877806663513
食宿 0.001698891632258892
急需 0.001698891632258892
建行 0.001698891632258892
葡萄酒 0.001698891632258892
新版 0.001698891632258892
富豪 0.001698891632258892
对比 0.001698891632258892
泥工 0.001698891632258892
相信 8.898956584744155E-4

topic 9
:
体育 0.7940398454666138
活动 0.005577780772000551
优惠 0.0038460372015833855
欢迎 0.003806901630014181
银行 0.0032981408294290304
电话 0.003268789267167449
联系 0.0031611667945981026
公司 0.002769812010228634
地址 0.0024860799312591553
】 0.002339322119951248

topic 10
:
年级 0.0023899467196315527

车主 0.0023899467196315527
过程 0.0016189961461350322
华联 0.0016189961461350322
家电 0.0016189961461350322
大业 0.0016189961461350322
时代 0.0016189961461350322
迪赛尼斯 0.0016189961461350322
稀缺 0.0016189961461350322
稳定 0.0016189961461350322

topic 11
:
利率 0.002570267766714096
知名 0.002570267766714096
南湖 0.0017411491135135293
实现 0.0017411491135135293
立秋 0.0017411491135135293
就读 0.0017411491135135293
罗马 0.0017411491135135293
广电局 0.0017411491135135293
独具 0.0017411491135135293
静候 0.0017411491135135293

topic 12
:
哥哥 0.0029536776710301638
家里 0.0029536776710301638
化妆 0.0029536776710301638
名品 0.0022332684602588415

一 0.0022332684602588415
四川 0.0015128592494875193
二手车 0.0015128592494875193
订购 0.0015128592494875193
多种 0.0015128592494875193
潜力 0.0015128592494875193

topic 13
:
建行 0.002435001078993082
开发商 0.0016495168674737215
美容 0.0016495168674737215
奔驰 0.0016495168674737215
比例 0.0016495168674737215
英伦 0.0016495168674737215
开通 0.0016495168674737215
开班 0.0016495168674737215
打开 0.0016495168674737215
英国 0.0016495168674737215

topic 14
:
增值 0.002355444012209773
[验] 0.002355444012209773
公开 0.0015956234419718385
打印机 0.0015956234419718385
家中 0.0015956234419718385
宾馆 0.0015956234419718385
12000 0.0015956234419718385
渠道 0.0015956234419718385
租赁 0.0015956234419718385
无效 0.0015956234419718385

topic 15
:
自由 0.0024857670068740845

巴拉巴 0.0024857670068740845

丰 0.0024857670068740845
朝阳 0.001683906652033329
家人 0.001683906652033329
84725588 0.001683906652033329
老弟 0.001683906652033329
商住 0.001683906652033329
县委 0.001683906652033329
德国 8.820463554002345E-4

topic 16
:
￥10亿 0.002975110663101077
楼下 0.002249473938718438
感恩 0.002249473938718438
独栋 0.002249473938718438
前来 0.0015238370979204774
手机 0.0015238370979204774
申请 0.0015238370979204774

乐 0.0015238370979204774
考点 0.0015238370979204774
3008300 0.0015238370979204774

topic 17
:
批发 0.00239548715762794
总监 0.0016227493761107326
车子 0.0016227493761107326
饭店 0.0016227493761107326
伙伴 0.0016227493761107326
直属 0.0016227493761107326
事后 0.0016227493761107326
翰林 0.0016227493761107326
专题片 0.0016227493761107326
装修 8.500116528011858E-4

topic 18
:
期待 0.0024758405052125454

价 0.0016771822702139616
你好 0.0016771822702139616
决定 0.0016771822702139616
助剂 0.0016771822702139616
人员 0.0016771822702139616
雄伟 0.0016771822702139616
只用 0.0016771822702139616
享受 8.785240934230387E-4
四川 8.785240934230387E-4

topic 19
:
房价 0.003103474387899041
底价 0.0023465293925255537
湖南 0.0015895843971520662

凡 0.0015895843971520662
送礼 0.0015895843971520662
恒大 0.0015895843971520662
一生 0.0015895843971520662
代言人 0.0015895843971520662
专车 0.0015895843971520662
大唐 0.0015895843971520662

topic 20
:
企业主 0.0023483068216592073
讲师 0.0023483068216592073

6222021001055293358 0.0023483068216592073
首发 0.0015907884808257222
认购 0.0015907884808257222
请问 0.0015907884808257222
发布 0.0015907884808257222
中午 0.0015907884808257222
开幕 0.0015907884808257222
⒍ 0.0015907884808257222

topic 21
:
重新 0.002323663793504238
帮忙 0.002323663793504238
85654475 0.002323663793504238

宾 0.002323663793504238

中国 0.0015740948729217052
学历 0.0015740948729217052
＂ 0.0015740948729217052
温州 0.0015740948729217052
好久 0.0015740948729217052
钢板 0.0015740948729217052

topic 22
:
可口 0.0024103878531605005
形象 0.0024103878531605005
减轻 0.0024103878531605005
高层 0.0016328433994203806
爸爸 0.0016328433994203806
基金 0.0016328433994203806
营业额 0.0016328433994203806
意大利 0.0016328433994203806
正常 0.0016328433994203806
吉智 0.0016328433994203806

topic 23
:
关系 0.0024738647043704987
经营 0.0016758438432589173
美容 0.0016758438432589173
梦想 0.0016758438432589173
喷漆 0.0016758438432589173
肌肤 0.0016758438432589173
刘汉琳 0.0016758438432589173
索菲 0.0016758438432589173
依依 0.0016758438432589173
欢迎 8.778230403549969E-4

topic 24
:
考试 0.0016652129124850035
上班 0.0016652129124850035
金条 0.0016652129124850035

宝 0.0016652129124850035
澳门 0.0016652129124850035
粘贴 0.0016652129124850035
收缩 0.0016652129124850035
18800574923 0.0016652129124850035
豪华 8.722544298507273E-4
老师 8.722544298507273E-4

topic 25
:
长期 0.0030594731215387583
开发区 0.0023132602218538523
低价 0.0023132602218538523
⑥ 0.0023132602218538523
转告 0.0023132602218538523

新 0.0015670472057536244
得到 0.0015670472057536244
[通] 0.0015670472057536244
融资 0.0015670472057536244
万科 0.0015670472057536244

topic 26
:
开发区 0.002339445985853672
石油 0.0015847859904170036
宁波 0.0015847859904170036
更换 0.0015847859904170036
不用 0.0015847859904170036
会议 0.0015847859904170036
初三 0.0015847859904170036
汽车站 0.0015847859904170036
抽空 0.0015847859904170036
实用 0.0015847859904170036

topic 27
:
代办 0.0016745076281949878
代表 0.0016745076281949878
女性 0.0016745076281949878
13825139678 0.0016745076281949878
承担 0.0016745076281949878
影响力 0.0016745076281949878
13934141989 0.0016745076281949878
槐花 0.0016745076281949878

沐 0.0016745076281949878
过敏 0.0016745076281949878

topic 28
:
婚礼 0.00862991251051426
海尔 0.002210969338193536
电影 0.002210969338193536
小乔 0.002210969338193536
15953174009 0.002210969338193536
茶店 0.002210969338193536
7627292. 0.002210969338193536
15985917304 0.002210969338193536
新余 0.001497753313742578
资料 0.001497753313742578

topic 29
:
【 0.021667908877134323

你 0.015670640394091606
您好 0.01555958017706871
光临 0.014560035429894924

尊敬 0.014337914064526558
现在 0.013005186803638935
】 0.012338823638856411
享受 0.010783976875245571
信用 0.009451250545680523
详情 0.007896402850747108

topic 30
:
西吉 0.0024778195656836033
封顶 0.0016785229090601206
押金 0.0016785229090601206
海外 0.0016785229090601206
澜庭 0.0016785229090601206
账户 0.0016785229090601206
原因 0.0016785229090601206

6222021001036927348 0.0016785229090601206
欧莱雅 0.0016785229090601206
推荐 8.792263106442988E-4

pre name= code >

原创文章，作者：ItWorker，如若转载，请注明出处：https://blog.ytso.com/9510.html

分布式文件系统，分布式数据库区块链并行处理（MPP）数据库，数据挖掘开源大数据平台数据中台数据分析数据开发数据治理数据湖数据采集

猜你喜欢

探索Redis默认安装地址（redis默认安装地址）
重新开始：清空Redis缓存（清空redis）
管理优化Oracle缓存管理提升数据库性能（oracle缓存）
一个优秀的企业网站应该具备的五个特点
如何让你的Lightbox支持滚轮缩放及Base64图片
一月MySQL：增加一个月的时间间隔（mysql当前时间加）
MySQL JDBC StreamResult通信原理浅析
1411年，MySQL的诞生（1411mysql）
Oracle改变记账准则，踏上新的征程（oracle不同记账准则）
仿淘宝收货地址，本地数据库详解手机开发
Linux D文件：终极保护神器（linuxd是什么文件）
ORA-19239: XPTY0019 – It is a type error if the result of an step (other than the last step) in a path expression contains an atomic value ORACLE 报错故障修复远程处理
AMD芯片支持搭载MySQL数据库环境（amd支持MySQL吗）
Redis重启编号重复的惊魂之夜（redis重启后编号重复）
Oracle书籍下载抓住学习机会（oracle书本下载）
ps软件绿色版 Adobe Photoshop 图片处理电脑用ps软件最新版本

zl程序教程

当前栏目

LDA主题模型的java代码实现详解大数据

相关文章