Text Similarity Algorithms in .NET: A Brief Analysis and Application of Cosine Similarity and SimHash


This article walks through two text similarity algorithms in .NET, cosine similarity and SimHash, with a brief analysis and application examples, shared here for your reference. The details are as follows:

Cosine Similarity

Principle: first, segment the two pieces of text into words and list all the words; next, count the frequency of each word; finally, turn the words into vectors. The problem then reduces to measuring how similar the two vectors are.
 
A simple illustration:
 
Text 1: 我/爱/北京/天安门 ("I / love / Beijing / Tiananmen"); segmentation and frequency counting give the (pseudo) vector [1, 1, 1, 1]
 
Text 2: 我们/都爱/北京/天安门 ("we / all love / Beijing / Tiananmen"); segmentation and frequency counting give the (pseudo) vector [1, 0, 1, 2]
 
We can picture these as two line segments in space, both starting from the origin ([0, 0, ...]) and pointing in different directions. The two segments form an angle: if the angle is 0 degrees, the directions coincide and the segments overlap; if it is 90 degrees, they are perpendicular and completely dissimilar in direction; if it is 180 degrees, the directions are exactly opposite. The size of this angle therefore tells us how similar the vectors are: the smaller the angle, the more similar the texts.
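Plugging the two pseudo-vectors into the cosine formula makes the idea concrete (a hand calculation, not part of the original article):

    cos(theta) = (A · B) / (|A| * |B|)
               = (1*1 + 1*0 + 1*1 + 1*2) / (sqrt(1+1+1+1) * sqrt(1+0+1+4))
               = 4 / (2 * sqrt(6))
               ≈ 0.816

A result close to 1 means the two texts point in nearly the same direction and are very similar; a result close to 0 means they have almost nothing in common.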
 
Core C# algorithm:

The code is as follows:
// Note: the class below also depends on an external Tokeniser helper
// (with Partition and ArrayListToArray methods) that is part of the
// original sample project but not reproduced here.
using System;
using System.Collections;

public class TFIDFMeasure
{
    private string[] _docs;
    private string[][] _ngramDoc;
    private int _numDocs = 0;
    private int _numTerms = 0;
    private ArrayList _terms;
    private int[][] _termFreq;
    private float[][] _termWeight;
    private int[] _maxTermFreq;
    private int[] _docFreq;

    // Static helpers for cosine similarity between two term-weight vectors.
    public class TermVector
    {
        // cosine = dot(v1, v2) / (|v1| * |v2|)
        public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTH");

            float denom = VectorLength(vector1) * VectorLength(vector2);
            if (denom == 0F)
                return 0F;
            else
                return InnerProduct(vector1, vector2) / denom;
        }

        public static float InnerProduct(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new Exception("DIFFER LENGTHS ARE NOT ALLOWED");

            float result = 0F;
            for (int i = 0; i < vector1.Length; i++)
                result += vector1[i] * vector2[i];

            return result;
        }

        public static float VectorLength(float[] vector)
        {
            float sum = 0.0F;
            for (int i = 0; i < vector.Length; i++)
                sum = sum + (vector[i] * vector[i]);

            return (float)Math.Sqrt(sum);
        }
    }

    private IDictionary _wordsIndex = new Hashtable();

    public TFIDFMeasure(string[] documents)
    {
        _docs = documents;
        _numDocs = documents.Length;
        MyInit();
    }

    private void GeneratNgramText()
    {
        // (left empty in the original sample)
    }

    // Collect the distinct terms that occur across all documents.
    private ArrayList GenerateTerms(string[] docs)
    {
        ArrayList uniques = new ArrayList();
        _ngramDoc = new string[_numDocs][];
        for (int i = 0; i < docs.Length; i++)
        {
            Tokeniser tokenizer = new Tokeniser();
            string[] words = tokenizer.Partition(docs[i]);

            for (int j = 0; j < words.Length; j++)
                if (!uniques.Contains(words[j]))
                    uniques.Add(words[j]);
        }
        return uniques;
    }

    private static object AddElement(IDictionary collection, object key, object newValue)
    {
        object element = collection[key];
        collection[key] = newValue;
        return element;
    }

    private int GetTermIndex(string term)
    {
        object index = _wordsIndex[term];
        if (index == null) return -1;
        return (int)index;
    }

    // Build the term index, then the frequency and tf-idf weight matrices.
    private void MyInit()
    {
        _terms = GenerateTerms(_docs);
        _numTerms = _terms.Count;

        _maxTermFreq = new int[_numDocs];
        _docFreq = new int[_numTerms];
        _termFreq = new int[_numTerms][];
        _termWeight = new float[_numTerms][];

        for (int i = 0; i < _terms.Count; i++)
        {
            _termWeight[i] = new float[_numDocs];
            _termFreq[i] = new int[_numDocs];

            AddElement(_wordsIndex, _terms[i], i);
        }

        GenerateTermFrequency();
        GenerateTermWeight();
    }

    private float Log(float num)
    {
        return (float)Math.Log(num); // natural log
    }

    // Raw term counts per document, plus each document's maximum count.
    private void GenerateTermFrequency()
    {
        for (int i = 0; i < _numDocs; i++)
        {
            string curDoc = _docs[i];
            IDictionary freq = GetWordFrequency(curDoc);
            IDictionaryEnumerator enums = freq.GetEnumerator();
            _maxTermFreq[i] = int.MinValue;
            while (enums.MoveNext())
            {
                string word = (string)enums.Key;
                int wordFreq = (int)enums.Value;
                int termIndex = GetTermIndex(word);

                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;

                if (wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq;
            }
        }
    }

    // tf-idf weight for every (term, document) pair.
    private void GenerateTermWeight()
    {
        for (int i = 0; i < _numTerms; i++)
        {
            for (int j = 0; j < _numDocs; j++)
                _termWeight[i][j] = ComputeTermWeight(i, j);
        }
    }

    // Normalized term frequency: raw count / maximum count in the document.
    private float GetTermFrequency(int term, int doc)
    {
        int freq = _termFreq[term][doc];
        int maxfreq = _maxTermFreq[doc];

        return (float)freq / (float)maxfreq;
    }

    private float GetInverseDocumentFrequency(int term)
    {
        int df = _docFreq[term];
        return Log((float)_numDocs / (float)df);
    }

    private float ComputeTermWeight(int term, int doc)
    {
        float tf = GetTermFrequency(term, doc);
        float idf = GetInverseDocumentFrequency(term);
        return tf * idf;
    }

    // The tf-idf weight vector of a single document.
    private float[] GetTermVector(int doc)
    {
        float[] w = new float[_numTerms];
        for (int i = 0; i < _numTerms; i++)
            w[i] = _termWeight[i][doc];
        return w;
    }

    // Cosine similarity between the tf-idf vectors of two documents.
    public float GetSimilarity(int doc_i, int doc_j)
    {
        float[] vector1 = GetTermVector(doc_i);
        float[] vector2 = GetTermVector(doc_j);
        return TermVector.ComputeCosineSimilarity(vector1, vector2);
    }

    // Word -> count map for a single document.
    private IDictionary GetWordFrequency(string input)
    {
        string convertedInput = input.ToLower();
        Tokeniser tokenizer = new Tokeniser();
        String[] words = tokenizer.Partition(convertedInput);
        Array.Sort(words);

        String[] distinctWords = GetDistinctWords(words);

        IDictionary result = new Hashtable();
        for (int i = 0; i < distinctWords.Length; i++)
        {
            object tmp;
            tmp = CountWords(distinctWords[i], words);
            result[distinctWords[i]] = tmp;
        }
        return result;
    }

    private string[] GetDistinctWords(String[] input)
    {
        if (input == null)
            return new string[0];
        else
        {
            ArrayList list = new ArrayList();

            for (int i = 0; i < input.Length; i++)
                if (!list.Contains(input[i])) // N-GRAM SIMILARITY?
                    list.Add(input[i]);
            return Tokeniser.ArrayListToArray(list);
        }
    }

    // Count occurrences of word in the sorted array: binary search,
    // rewind to the first match, then scan forward.
    private int CountWords(string word, string[] words)
    {
        int itemIdx = Array.BinarySearch(words, word);

        if (itemIdx > 0)
            while (itemIdx > 0 && words[itemIdx].Equals(word))
                itemIdx--;
        int count = 0;
        while (itemIdx < words.Length && itemIdx >= 0)
        {
            if (words[itemIdx].Equals(word)) count++;
            itemIdx++;
            if (itemIdx < words.Length)
                if (!words[itemIdx].Equals(word)) break;
        }
        return count;
    }
}
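
For a quick sanity check, the nested TermVector helper can be called directly with the pseudo-vectors from the earlier example. This small driver is only an illustration and is not part of the original class:

using System;

class CosineDemo
{
    static void Main()
    {
        // Pseudo term-frequency vectors for the two example sentences above
        float[] doc1 = { 1F, 1F, 1F, 1F };
        float[] doc2 = { 1F, 0F, 1F, 2F };

        // Prints roughly 0.816, matching the hand calculation earlier
        float similarity = TFIDFMeasure.TermVector.ComputeCosineSimilarity(doc1, doc2);
        Console.WriteLine(similarity);
    }
}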


 
Drawbacks:
 
A single article may contain a very large number of feature words, which makes the vectors very high-dimensional; the computation then becomes too expensive, so this approach does not scale well to large data volumes.
 
SimHash principle:
 
The main idea of the algorithm is dimensionality reduction: a high-dimensional feature vector is mapped down to an f-bit fingerprint, and two articles are judged to be duplicates or highly similar by comparing the Hamming distance between their f-bit fingerprints. Because the fingerprint of every article can be computed and stored in advance, comparing two articles later only requires a Hamming distance between two small fingerprints, so the method is very fast and well suited to large-scale data.
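The Hamming distance between two fingerprints is just the number of bit positions in which they differ, which reduces to an XOR followed by a bit count. A minimal helper (my own sketch, not code from the original article):

// Number of differing bits between two 32-bit fingerprints
static int HammingDistance(uint fp1, uint fp2)
{
    uint x = fp1 ^ fp2;    // bits set where the fingerprints differ
    int count = 0;
    while (x != 0)
    {
        count++;
        x &= x - 1;        // clear the lowest set bit
    }
    return count;
}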
 
Google uses this algorithm to detect duplicate web pages. Suppose we have the following three pieces of text:
 
1. the cat sat on the mat
 
2. the cat sat on a mat
 
3. we all scream for ice cream
 
How is this hash computed? Taking the three texts above as an example, the whole process can be broken into six steps (a code sketch that puts them together follows the list):
1. Choose the number of bits for the simhash, weighing storage cost against the size of the data set; say 32 bits.
2. Initialize every bit of the simhash to 0.
3. Extract features from the original text, usually by some form of tokenization. For "the cat sat on the mat", splitting into character bigrams gives: {"th", "he", "e ", " c", "ca", "at", "t ", " s", "sa", " o", "on", "n ", " t", " m", "ma"}.
4. Compute a 32-bit hash code for each feature with a conventional hash function, for example "th".hash = -502157718, "he".hash = -369049682, and so on.
5. For every bit of each feature's hash code, if that bit is 1, add 1 to the corresponding simhash counter; otherwise subtract 1.
6. For each of the 32 counters obtained this way, if its value is greater than 0, set the corresponding bit of the final simhash to 1; otherwise set it to 0.
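
Putting the six steps together, a minimal C# sketch might look like the following. This is my own illustration of the steps above, not code from the original article; it uses string.GetHashCode() as the 32-bit hash purely for brevity (in .NET it is not stable across processes, so a real implementation would substitute a deterministic hash such as MurmurHash):

using System;
using System.Collections.Generic;

class SimHashDemo
{
    // Step 3: extract features; here, simple character bigrams
    static IEnumerable<string> Bigrams(string text)
    {
        for (int i = 0; i + 1 < text.Length; i++)
            yield return text.Substring(i, 2);
    }

    // Steps 1-6: build a 32-bit simhash fingerprint
    static uint SimHash(string text)
    {
        int[] v = new int[32];                        // step 2: 32 counters, all zero
        foreach (string feature in Bigrams(text))     // step 3: features
        {
            int hash = feature.GetHashCode();         // step 4: 32-bit hash per feature
            for (int bit = 0; bit < 32; bit++)        // step 5: +1 if the bit is 1, -1 otherwise
                v[bit] += ((hash >> bit) & 1) == 1 ? 1 : -1;
        }

        uint fingerprint = 0;
        for (int bit = 0; bit < 32; bit++)            // step 6: keep the sign of each counter
            if (v[bit] > 0) fingerprint |= 1u << bit;
        return fingerprint;
    }

    // Same XOR-and-count helper as shown earlier
    static int HammingDistance(uint a, uint b)
    {
        uint x = a ^ b;
        int count = 0;
        while (x != 0) { count++; x &= x - 1; }
        return count;
    }

    static void Main()
    {
        uint h1 = SimHash("the cat sat on the mat");
        uint h2 = SimHash("the cat sat on a mat");
        uint h3 = SimHash("we all scream for ice cream");

        // Texts 1 and 2 should give a much smaller distance than texts 1 and 3
        Console.WriteLine(HammingDistance(h1, h2));
        Console.WriteLine(HammingDistance(h1, h3));
    }
}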

Hopefully what is described here is of some help to readers working on .NET programming.