Tuesday, March 13, 2012

Shingles algorithm

A Few years ago I wrote my "AI" web crawler,  for gathering news articles all over the world in real time. Yeas, that was great adventure! And one of the problem, that I've impacted was- almost 30% of text was copied each other and  little bit edited, but the main  goal remain are the same.
Search for fuzzy duplicates allows to assume there are two objects if they same part or not. The object can be interpreted  as text files or any other data types. We will work with the text, but  once you realizing how the algorithm work-  will not be difficult to move my implementation on the objects you need.


Let  consider the  example of the text. Let us assume we have a text file in the 10th paragraph. Make a complete copy, and then rewrite only the last paragraph. 9 of 10 paragraphs of the second file is a complete copy of the original.

Another example is more complicated. If we rewrite the copy of the original text of each 5-6th of the sentences, the text will still be almost the same

How does Shingles algorithm works?
So, we have two of text and we need to assume if they are almost duplicates. The implementation of the algorithm involves the following steps:


  • canonization of texts;
  • partition of the text on the shingle;
  • finding a checksum;
  • find similar subsequences.
Now more specific. The algorithm is implemented shingles comparing checksums texts. In its implementation, I use MD5, but it applies, and others, such as SHA1 or  CRC32, etc.

Here is my code in C#:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Security.Cryptography;

namespace dataAnalyze.Algorithms
{
    public class Shingles
    {
        
        private char[] _stopSymbols = null;
        public Shingles(string stopSymbolFilePath)
        {
            if (!File.Exists(stopSymbolFilePath))
                throw new Exception("File with stop symbols not exists: " + stopSymbolFilePath);
            this._stopSymbols = File.ReadAllText(stopSymbolFilePath).ToCharArray();

        }


        public double CompareStrings(string s1, string s2)
        {
            if (s1.Length <= 50) return 0.0;
            if (s2.Length <= 50) return 0.0;
            if (s1.Length > s2.Length)
            {
                if (Math.Abs((s1.Length / (double)s2.Length)) >= 1.7)
                    return 0.0;
            }
            else
            {
                if (Math.Abs((s2.Length / (double)s2.Length)) >= 1.7)
                    return 0.0;
            }


            RemoveStopSymbols(ref s1);
            RemoveStopSymbols(ref s2);

            if (s1.Length <= 50) return 0.0;
            if (s2.Length <= 50) return 0.0;

            string[] shingles1 = getShingles(ref s1, 50);
            string[] shingles2 = getShingles(ref s2, 50);
            int same = 0;
            for (int i = 0; i < shingles1.Length; i++)
            {
                if (shingles2.Contains(shingles1[i]))
                    same++;
            }

            return same * 2 / ((double)(shingles1.Length + shingles2.Length)) * 100;
        }





        public double CompareStringsCashed(ref string[] shingles1, ref string[] shingles2, string s1, string s2)
        {
            if (s1 != null && s2 != null)
            {
                if (s1.Length > s2.Length)
                {
                    if (Math.Abs((s1.Length / (double)s2.Length)) >= 1.7)
                        return 0.0;
                }
                else
                {
                    if (Math.Abs((s2.Length / (double)s1.Length)) >= 1.7)
                        return 0.0;
                }
            }

            if (s1 != null)
            {
                if (s1.Length <= 50) return 0.0;
                string inS1 = s1;
                RemoveStopSymbols(ref inS1);
                if (inS1.Length <= 50) return 0.0;
                shingles1 = getShingles(ref inS1, 50);
            }

            if (s2 != null)
            {
                if (s2.Length <= 50) return 0.0;
                string inS2 = s2;
                RemoveStopSymbols(ref inS2);
                if (inS2.Length <= 50) return 0.0;
                shingles2 = getShingles(ref inS2, 50);
            }                        
           
            int same = 0;
            for (int i = 0; i < shingles1.Length; i++)
            {
                if (shingles2.Contains(shingles1[i]))
                    same++;
            }

            return same * 2 / ((double)(shingles1.Length + shingles2.Length)) * 100;
          
        }


        /// <summary>
        /// get shingles and calculate hash for everyone
        /// </summary>
        /// <param name="source"></param>
        /// <param name="shingleLenght"></param>
        /// <returns></returns>
        private string[] getShingles(ref string source, int shingleLenght)
        {
            string[] shingles = new string[source.Length - (shingleLenght - 1)];
            int shift = 0;
            for (int i = 0; i < shingles.Length; i++)
            {

                shingles[i] = CalculateMD5Hash(
                    (source.Length >= shift + shingleLenght ? source.Substring(shift, shingleLenght) : source.Substring(shift, source.Length - (shift + shingleLenght)))
                    );
                shift++;
            }

            return shingles;

        }

        /// <summary>
        /// delete some inappropriate chars from the string
        /// </summary>
        /// <param name="source"></param>
        private void RemoveStopSymbols(ref string source)
        {
            int[] positionForRemove = new int[source.Length];
            int arrayCounter = 0;
            FindIndexOfSymbols(ref source, ref positionForRemove, ref arrayCounter, ref this._stopSymbols);
            Array.Resize(ref positionForRemove, arrayCounter);
            Array.Sort(positionForRemove);
            //Array.Reverse(positionForRemove);
            int shift = 0;
            StringBuilder result = new StringBuilder(source.Length - arrayCounter);
            for (int i = 0; i < source.Length; i++)
            {
                if (i == positionForRemove[shift])
                {
                    if (positionForRemove.Length > shift + 1)
                        shift++;
                }
                else
                    result.Append(source[i]);
            }

            //positionForRemove = null;
            source = result.ToString();

        }




        /// <summary>
        /// 
        /// </summary>
        /// <param name="source"> link for original string</param>
        /// <param name="positionsForRemove">array of indexes</param>
        /// <param name="arrayCounter">point to next element in array</param>
        /// <param name="symbols"></param>
        private void FindIndexOfSymbols(ref string source, ref int[] positionsForRemove, ref int arrayCounter, ref char[] symbols)
        {
            for (int i = 0; i < source.Length; i++)
            {
                for (int j = 0; j < symbols.Length; j++)
                    if (source[i] == symbols[j])
                    {
                        positionsForRemove[arrayCounter] = i;
                        arrayCounter++;

                    }
            }
        }





        public string CalculateMD5Hash(string input)
        {
            // step 1, calculate MD5 hash from input
            MD5 md5 = System.Security.Cryptography.MD5.Create();
            byte[] inputBytes = System.Text.Encoding.ASCII.GetBytes(input);
            byte[] hash = md5.ComputeHash(inputBytes);

            // step 2, convert byte array to hex string
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < hash.Length; i++)
            {
                sb.Append(hash[i].ToString("X2"));
            }
            return sb.ToString();
        }

    }
}



No comments:

Post a Comment