Tuesday, March 13, 2012

Shingles algorithm

A Few years ago I wrote my "AI" web crawler,  for gathering news articles all over the world in real time. Yeas, that was great adventure! And one of the problem, that I've impacted was- almost 30% of text was copied each other and  little bit edited, but the main  goal remain are the same.
Search for fuzzy duplicates allows to assume there are two objects if they same part or not. The object can be interpreted  as text files or any other data types. We will work with the text, but  once you realizing how the algorithm work-  will not be difficult to move my implementation on the objects you need.


Let  consider the  example of the text. Let us assume we have a text file in the 10th paragraph. Make a complete copy, and then rewrite only the last paragraph. 9 of 10 paragraphs of the second file is a complete copy of the original.

Another example is more complicated. If we rewrite the copy of the original text of each 5-6th of the sentences, the text will still be almost the same

How does Shingles algorithm works?
So, we have two of text and we need to assume if they are almost duplicates. The implementation of the algorithm involves the following steps:


  • canonization of texts;
  • partition of the text on the shingle;
  • finding a checksum;
  • find similar subsequences.
Now more specific. The algorithm is implemented shingles comparing checksums texts. In its implementation, I use MD5, but it applies, and others, such as SHA1 or  CRC32, etc.

Here is my code in C#:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Security.Cryptography;

namespace dataAnalyze.Algorithms
{
    public class Shingles
    {
        
        private char[] _stopSymbols = null;
        public Shingles(string stopSymbolFilePath)
        {
            if (!File.Exists(stopSymbolFilePath))
                throw new Exception("File with stop symbols not exists: " + stopSymbolFilePath);
            this._stopSymbols = File.ReadAllText(stopSymbolFilePath).ToCharArray();

        }


        public double CompareStrings(string s1, string s2)
        {
            if (s1.Length <= 50) return 0.0;
            if (s2.Length <= 50) return 0.0;
            if (s1.Length > s2.Length)
            {
                if (Math.Abs((s1.Length / (double)s2.Length)) >= 1.7)
                    return 0.0;
            }
            else
            {
                if (Math.Abs((s2.Length / (double)s2.Length)) >= 1.7)
                    return 0.0;
            }


            RemoveStopSymbols(ref s1);
            RemoveStopSymbols(ref s2);

            if (s1.Length <= 50) return 0.0;
            if (s2.Length <= 50) return 0.0;

            string[] shingles1 = getShingles(ref s1, 50);
            string[] shingles2 = getShingles(ref s2, 50);
            int same = 0;
            for (int i = 0; i < shingles1.Length; i++)
            {
                if (shingles2.Contains(shingles1[i]))
                    same++;
            }

            return same * 2 / ((double)(shingles1.Length + shingles2.Length)) * 100;
        }





        public double CompareStringsCashed(ref string[] shingles1, ref string[] shingles2, string s1, string s2)
        {
            if (s1 != null && s2 != null)
            {
                if (s1.Length > s2.Length)
                {
                    if (Math.Abs((s1.Length / (double)s2.Length)) >= 1.7)
                        return 0.0;
                }
                else
                {
                    if (Math.Abs((s2.Length / (double)s1.Length)) >= 1.7)
                        return 0.0;
                }
            }

            if (s1 != null)
            {
                if (s1.Length <= 50) return 0.0;
                string inS1 = s1;
                RemoveStopSymbols(ref inS1);
                if (inS1.Length <= 50) return 0.0;
                shingles1 = getShingles(ref inS1, 50);
            }

            if (s2 != null)
            {
                if (s2.Length <= 50) return 0.0;
                string inS2 = s2;
                RemoveStopSymbols(ref inS2);
                if (inS2.Length <= 50) return 0.0;
                shingles2 = getShingles(ref inS2, 50);
            }                        
           
            int same = 0;
            for (int i = 0; i < shingles1.Length; i++)
            {
                if (shingles2.Contains(shingles1[i]))
                    same++;
            }

            return same * 2 / ((double)(shingles1.Length + shingles2.Length)) * 100;
          
        }


        /// <summary>
        /// get shingles and calculate hash for everyone
        /// </summary>
        /// <param name="source"></param>
        /// <param name="shingleLenght"></param>
        /// <returns></returns>
        private string[] getShingles(ref string source, int shingleLenght)
        {
            string[] shingles = new string[source.Length - (shingleLenght - 1)];
            int shift = 0;
            for (int i = 0; i < shingles.Length; i++)
            {

                shingles[i] = CalculateMD5Hash(
                    (source.Length >= shift + shingleLenght ? source.Substring(shift, shingleLenght) : source.Substring(shift, source.Length - (shift + shingleLenght)))
                    );
                shift++;
            }

            return shingles;

        }

        /// <summary>
        /// delete some inappropriate chars from the string
        /// </summary>
        /// <param name="source"></param>
        private void RemoveStopSymbols(ref string source)
        {
            int[] positionForRemove = new int[source.Length];
            int arrayCounter = 0;
            FindIndexOfSymbols(ref source, ref positionForRemove, ref arrayCounter, ref this._stopSymbols);
            Array.Resize(ref positionForRemove, arrayCounter);
            Array.Sort(positionForRemove);
            //Array.Reverse(positionForRemove);
            int shift = 0;
            StringBuilder result = new StringBuilder(source.Length - arrayCounter);
            for (int i = 0; i < source.Length; i++)
            {
                if (i == positionForRemove[shift])
                {
                    if (positionForRemove.Length > shift + 1)
                        shift++;
                }
                else
                    result.Append(source[i]);
            }

            //positionForRemove = null;
            source = result.ToString();

        }




        /// <summary>
        /// 
        /// </summary>
        /// <param name="source"> link for original string</param>
        /// <param name="positionsForRemove">array of indexes</param>
        /// <param name="arrayCounter">point to next element in array</param>
        /// <param name="symbols"></param>
        private void FindIndexOfSymbols(ref string source, ref int[] positionsForRemove, ref int arrayCounter, ref char[] symbols)
        {
            for (int i = 0; i < source.Length; i++)
            {
                for (int j = 0; j < symbols.Length; j++)
                    if (source[i] == symbols[j])
                    {
                        positionsForRemove[arrayCounter] = i;
                        arrayCounter++;

                    }
            }
        }





        public string CalculateMD5Hash(string input)
        {
            // step 1, calculate MD5 hash from input
            MD5 md5 = System.Security.Cryptography.MD5.Create();
            byte[] inputBytes = System.Text.Encoding.ASCII.GetBytes(input);
            byte[] hash = md5.ComputeHash(inputBytes);

            // step 2, convert byte array to hex string
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < hash.Length; i++)
            {
                sb.Append(hash[i].ToString("X2"));
            }
            return sb.ToString();
        }

    }
}



Unit tests


Do you use Unit tests on the regular base?
You can ask me - what is this? what exactly a unit tests?
I believe a lots of us  too lazy for using Unit tests, and that's true.  I would be willing to bet that only about a few  really do.
You see, the main problem  here is that when most developers are asked to provide any definition of unit testing they will say something like “tests that are cover every unit in  your application” or something like that. Yes, that's partially true.  Let me provide you some info regarding Unit Tests.
Unit test  it's s a test which verifies a “unit”, or the smallest piece of an application which is able to be tested.  Unit testing is all about testing small pieces in isolation.

Why should we use unit tests?

Unit tests find problems early in the development cycle.
Then  earlier problems are found, then cheaper it is to fix them.

The development process becomes more flexible
Sometimes it may be required to fix a problem and to deploy the new fix very quick. Despite efforts, a bug may hides in and an important feature may out of order.  Releasing quick fixes makes us feel uneasy because we are not certain what side-effects the changes might have. Running the unit tests with the fixes applied saves the day as they should reveal undesirable side-effects.

Unit tests are fast.
It's all  about point above.  after any changes you will definitely know  current  status of your source code, just tun unit test that you have wrote before.

At the end of the end, I would like to show you  how do I test QuickSort Method, that I've wrote recently.
I'll use NUnit framework  for  this purpose.


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
namespace TestLib
{
    [TestFixture]
    public class TestQuickSort
    {
        [Test, TestCaseSource("Test_Success_Cases")]
        public void SortTest(int[] A)
        {
            bool verifyForNull = (A == null ? true : false);

            SortLib.QuickSort<int> sort = new QuickSort<int>();
            sort.Sort(A);
            if (!verifyForNull && A == null)
                Assert.Fail("Sorted Array is null, but should not be");
            if (verifyForNull)
            {
                Assert.IsNull(A);
                return;
            }

            int tmp = int.MinValue;
            for (int i = 0; i < A.Length; i++)
                if (tmp > A[i])
                    Assert.Fail("Array not sorted properly");
                else
                {
                    tmp = A[i];
                }

            Assert.Pass("Sorted properly");



        }



        static int[] GenArray(int size, int maxVal)
        {
            int[] result = new int[size];
            Random rnd = new Random();
            for (int i = 0; i < size; i++)
            {
                result[i] = rnd.Next(0, Math.Abs(maxVal));
                if (maxVal < 0)
                    result[i] *= -1;
            }
            return result;
        }

        static object[] Test_Success_Cases =
                     {
                         null,
                         new int[0],
                            GenArray(10,-11),                          
                            GenArray(10,0),                          
                            GenArray(1,1), 
                            GenArray(10,11), 
                            GenArray(100,101), 
                            GenArray(1000,1001), 
                            GenArray(1000000,10000001)
                           
                     };
    }
}

Yes, I know this not too deep dive description of Unit test, but I'll add some thing more additional later.

Monday, March 12, 2012

Image Processing

I few weeks ago  I was  played with  image  processing.  Actually I  have a dream, I want  write super smart  application,  for image processing. But today, I'll talk about very simple   image modifications.
As I said, I few weeks  ago  I got  tons of   images,  that  I have to  update them with some sort of template.
So I wrote small Lib  and Console application for this purpose.
Result:

Original image:

Here is Image Processing Lib source code:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
 public class EditImage
    {
        public Image cropImage(Image img, Rectangle cropArea)
        {
            Bitmap bmpImage = new Bitmap(img);
            Bitmap bmpCrop = bmpImage.Clone(cropArea,
                                            bmpImage.PixelFormat);
            return (Image)(bmpCrop);
        }

        public Image resizeImage(Image imgToResize, Size size)
        {
            int sourceWidth = imgToResize.Width;
            int sourceHeight = imgToResize.Height;

            float nPercent = 0;
            float nPercentW = 0;
            float nPercentH = 0;

            nPercentW = ((float)size.Width / (float)sourceWidth);
            nPercentH = ((float)size.Height / (float)sourceHeight);

            if (nPercentH < nPercentW)
                nPercent = nPercentH;
            else
                nPercent = nPercentW;

            int destWidth = (int)(sourceWidth * nPercent);
            int destHeight = (int)(sourceHeight * nPercent);

            Bitmap b = new Bitmap(destWidth, destHeight);
            Graphics g = Graphics.FromImage((Image)b);
            g.InterpolationMode = InterpolationMode.HighQualityBicubic;

            g.DrawImage(imgToResize, 0, 0, destWidth, destHeight);
            g.Dispose();

            return (Image)b;
        }

        public void saveJpeg(string path, Bitmap img, long quality)
        {
            // Encoder parameter for image quality
            EncoderParameter qualityParam =
                new EncoderParameter(System.Drawing.Imaging.Encoder.Quality, quality);

            // Jpeg image codec
            ImageCodecInfo jpegCodec = getEncoderInfo("image/jpeg");

            if (jpegCodec == null)
                return;

            EncoderParameters encoderParams = new EncoderParameters(1);
            encoderParams.Param[0] = qualityParam;

            img.Save(path, jpegCodec, encoderParams);
        }

        public ImageCodecInfo getEncoderInfo(string mimeType)
        {
            // Get image codecs for all image formats
            ImageCodecInfo[] codecs = ImageCodecInfo.GetImageEncoders();

            // Find the correct image codec
            for (int i = 0; i < codecs.Length; i++)
                if (codecs[i].MimeType == mimeType)
                    return codecs[i];
            return null;
        }

        public Image RoundCorners(Image StartImage, int CornerRadius, Color BackgroundColor)
        {
            CornerRadius *= 2;
            Bitmap RoundedImage = new Bitmap(StartImage.Width, StartImage.Height);
            Graphics g = Graphics.FromImage(RoundedImage);
            g.Clear(BackgroundColor);
            g.SmoothingMode = SmoothingMode.AntiAlias;
            Brush brush = new TextureBrush(StartImage);
            GraphicsPath gp = new GraphicsPath();
            gp.AddArc(0, 0, CornerRadius, CornerRadius, 180, 90);
            gp.AddArc(0 + RoundedImage.Width - CornerRadius, 0, CornerRadius, CornerRadius, 270, 90);
            gp.AddArc(0 + RoundedImage.Width - CornerRadius, 0 + RoundedImage.Height - CornerRadius, CornerRadius, CornerRadius, 0, 90);
            gp.AddArc(0, 0 + RoundedImage.Height - CornerRadius, CornerRadius, CornerRadius, 90, 90);
            g.FillPath(brush, gp);
            return RoundedImage;
        }

    }

Here is Console Application with  Image processing invocation :
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
static void Main(string[] args)
        {
            string[] filePaths = Directory.GetFiles(@"C:\projects\ProjectX\ImageProcessing\testProcessing\bin\Debug\input\", "*.jpg");
            Image workedImage = null;
            Image template = null;
            ImageProcessing.EditImage editImage = new ImageProcessing.EditImage();
            for (int i = 0; i < filePaths.Length; i++)
            {
                try
                {
                    workedImage = Image.FromFile(filePaths[i]);//("nissan-370z-rc-1.jpg");
                    template = Image.FromFile("template_glass_orb.png");

                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                    Console.ReadKey();
                }
                if (workedImage.Width < workedImage.Height || workedImage.Width < 300)
                {
                    workedImage.Dispose();
                    template.Dispose();
                    continue;
                }

                else if (workedImage.Width >= 300)
                {
                    workedImage = editImage.resizeImage(workedImage, new Size(300, 300));
                    workedImage = editImage.cropImage(workedImage, new Rectangle(0, 0, 200, workedImage.Height));
                    if (workedImage.Height > 200)
                    {
                        workedImage = editImage.cropImage(workedImage, new Rectangle(0, 0, workedImage.Width, 195));
                    }
                    workedImage = editImage.RoundCorners(workedImage, 100, Color.Transparent);

                    if (workedImage.Width < 200)
                    {
                        workedImage.Dispose();
                        template.Dispose();
                        Console.WriteLine("Oops, we are here");
                        Console.ReadKey();
                        continue;
                    }
                }



                using (var bitmap = new Bitmap(workedImage.Width, workedImage.Height))
                {
                    using (var canvas = Graphics.FromImage(bitmap))
                    {
                        canvas.InterpolationMode = InterpolationMode.HighQualityBicubic;
                        canvas.DrawImage(workedImage, new Rectangle(0, 0, workedImage.Width, workedImage.Height), new Rectangle(0, 0, workedImage.Width, workedImage.Height), GraphicsUnit.Pixel);

                        //modifiedImage = RoundCorners(disairedImage, 80, Color.Transparent);
                        canvas.DrawImage(template, 0, 0, template.Width, template.Height);
                        canvas.Save();
                    }
                    try
                    {
                        bitmap.Save(@"C:\projects\ProjectX\ImageProcessing\testProcessing\bin\Debug\input\new\" + Path.GetFileName(filePaths[i]), ImageFormat.Png);
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex.Message);

                    }
                }
                workedImage.Dispose();
                template.Dispose();
            }
            Console.WriteLine("done");
            Console.ReadKey();
}

As you can see, it's  very easy to use.