SimilarityIndex.java

  1. /*
  2.  * Copyright (C) 2010, Google Inc. and others
  3.  *
  4.  * This program and the accompanying materials are made available under the
  5.  * terms of the Eclipse Distribution License v. 1.0 which is available at
  6.  * https://www.eclipse.org/org/documents/edl-v10.php.
  7.  *
  8.  * SPDX-License-Identifier: BSD-3-Clause
  9.  */

  10. package org.eclipse.jgit.diff;

  11. import java.io.EOFException;
  12. import java.io.IOException;
  13. import java.io.InputStream;
  14. import java.util.Arrays;

  15. import org.eclipse.jgit.errors.MissingObjectException;
  16. import org.eclipse.jgit.lib.ObjectLoader;
  17. import org.eclipse.jgit.lib.ObjectStream;

  18. /**
  19.  * Index structure of lines/blocks in one file.
  20.  * <p>
  21.  * This structure can be used to compute an approximation of the similarity
  22.  * between two files. The index is used by
  23.  * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
  24.  * between files.
  25.  * <p>
  26.  * To save space in memory, this index uses a space efficient encoding which
  27.  * will not exceed 1 MiB per instance. The index starts out at a smaller size
  28.  * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
  29.  * file are discovered.
  30.  *
  31.  * @since 4.0
  32.  */
  33. public class SimilarityIndex {
  34.     /** A special {@link TableFullException} used in place of OutOfMemoryError. */
  35.     public static final TableFullException
  36.             TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

  37.     /**
  38.      * Shift to apply before storing a key.
  39.      * <p>
  40.      * Within the 64 bit table record space, we leave the highest bit unset so
  41.      * all values are positive. The lower 32 bits to count bytes.
  42.      */
  43.     private static final int KEY_SHIFT = 32;

  44.     /** Maximum value of the count field, also mask to extract the count. */
  45.     private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

  46.     /**
  47.      * Total amount of bytes hashed into the structure, including \n. This is
  48.      * usually the size of the file minus number of CRLF encounters.
  49.      */
  50.     private long hashedCnt;

  51.     /** Number of non-zero entries in {@link #idHash}. */
  52.     private int idSize;

  53.     /** {@link #idSize} that triggers {@link #idHash} to double in size. */
  54.     private int idGrowAt;

  55.     /**
  56.      * Pairings of content keys and counters.
  57.      * <p>
  58.      * Slots in the table are actually two ints wedged into a single long. The
  59.      * upper 32 bits stores the content key, and the remaining lower bits stores
  60.      * the number of bytes associated with that key. Empty slots are denoted by
  61.      * 0, which cannot occur because the count cannot be 0. Values can only be
  62.      * positive, which we enforce during key addition.
  63.      */
  64.     private long[] idHash;

  65.     /** {@code idHash.length == 1 << idHashBits}. */
  66.     private int idHashBits;

  67.     /**
  68.      * Create a new similarity index for the given object
  69.      *
  70.      * @param obj
  71.      *            the object to hash
  72.      * @return similarity index for this object
  73.      * @throws java.io.IOException
  74.      *             file contents cannot be read from the repository.
  75.      * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
  76.      *             object hashing overflowed the storage capacity of the
  77.      *             SimilarityIndex.
  78.      */
  79.     public static SimilarityIndex create(ObjectLoader obj) throws IOException,
  80.             TableFullException {
  81.         SimilarityIndex idx = new SimilarityIndex();
  82.         idx.hash(obj);
  83.         idx.sort();
  84.         return idx;
  85.     }

  86.     SimilarityIndex() {
  87.         idHashBits = 8;
  88.         idHash = new long[1 << idHashBits];
  89.         idGrowAt = growAt(idHashBits);
  90.     }

  91.     static boolean isBinary(ObjectLoader obj) throws IOException {
  92.         if (obj.isLarge()) {
  93.             try (ObjectStream in1 = obj.openStream()) {
  94.                 return RawText.isBinary(in1);
  95.             }
  96.         }
  97.         byte[] raw = obj.getCachedBytes();
  98.         return RawText.isBinary(raw, raw.length, true);
  99.     }

  100.     void hash(ObjectLoader obj) throws MissingObjectException, IOException,
  101.             TableFullException {
  102.         if (obj.isLarge()) {
  103.             hashLargeObject(obj);
  104.         } else {
  105.             byte[] raw = obj.getCachedBytes();
  106.             hash(raw, 0, raw.length);
  107.         }
  108.     }

  109.     private void hashLargeObject(ObjectLoader obj) throws IOException,
  110.             TableFullException {
  111.         boolean text;
  112.         text = !isBinary(obj);

  113.         try (ObjectStream in2 = obj.openStream()) {
  114.             hash(in2, in2.getSize(), text);
  115.         }
  116.     }

  117.     void hash(byte[] raw, int ptr, int end) throws TableFullException {
  118.         final boolean text = !RawText.isBinary(raw, raw.length, true);
  119.         hashedCnt = 0;
  120.         while (ptr < end) {
  121.             int hash = 5381;
  122.             int blockHashedCnt = 0;
  123.             int start = ptr;

  124.             // Hash one line, or one block, whichever occurs first.
  125.             do {
  126.                 int c = raw[ptr++] & 0xff;
  127.                 // Ignore CR in CRLF sequence if text
  128.                 if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
  129.                     continue;
  130.                 blockHashedCnt++;
  131.                 if (c == '\n')
  132.                     break;
  133.                 hash = (hash << 5) + hash + c;
  134.             } while (ptr < end && ptr - start < 64);
  135.             hashedCnt += blockHashedCnt;
  136.             add(hash, blockHashedCnt);
  137.         }
  138.     }

  139.     void hash(InputStream in, long remaining, boolean text) throws IOException,
  140.             TableFullException {
  141.         byte[] buf = new byte[4096];
  142.         int ptr = 0;
  143.         int cnt = 0;

  144.         while (0 < remaining) {
  145.             int hash = 5381;
  146.             int blockHashedCnt = 0;

  147.             // Hash one line, or one block, whichever occurs first.
  148.             int n = 0;
  149.             do {
  150.                 if (ptr == cnt) {
  151.                     ptr = 0;
  152.                     cnt = in.read(buf, 0, buf.length);
  153.                     if (cnt <= 0)
  154.                         throw new EOFException();
  155.                 }

  156.                 n++;
  157.                 int c = buf[ptr++] & 0xff;
  158.                 // Ignore CR in CRLF sequence if text
  159.                 if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
  160.                     continue;
  161.                 blockHashedCnt++;
  162.                 if (c == '\n')
  163.                     break;
  164.                 hash = (hash << 5) + hash + c;
  165.             } while (n < 64 && n < remaining);
  166.             hashedCnt += blockHashedCnt;
  167.             add(hash, blockHashedCnt);
  168.             remaining -= n;
  169.         }
  170.     }

  171.     /**
  172.      * Sort the internal table so it can be used for efficient scoring.
  173.      * <p>
  174.      * Once sorted, additional lines/blocks cannot be added to the index.
  175.      */
  176.     void sort() {
  177.         // Sort the array. All of the empty space will wind up at the front,
  178.         // because we forced all of the keys to always be positive. Later
  179.         // we only work with the back half of the array.
  180.         //
  181.         Arrays.sort(idHash);
  182.     }

  183.     /**
  184.      * Compute the similarity score between this index and another.
  185.      * <p>
  186.      * A region of a file is defined as a line in a text file or a fixed-size
  187.      * block in a binary file. To prepare an index, each region in the file is
  188.      * hashed; the values and counts of hashes are retained in a sorted table.
  189.      * Define the similarity fraction F as the count of matching regions
  190.      * between the two files divided between the maximum count of regions in
  191.      * either file. The similarity score is F multiplied by the maxScore
  192.      * constant, yielding a range [0, maxScore]. It is defined as maxScore for
  193.      * the degenerate case of two empty files.
  194.      * <p>
  195.      * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
  196.      *
  197.      * @param dst
  198.      *            the other index
  199.      * @param maxScore
  200.      *            the score representing a 100% match
  201.      * @return the similarity score
  202.      */
  203.     public int score(SimilarityIndex dst, int maxScore) {
  204.         long max = Math.max(hashedCnt, dst.hashedCnt);
  205.         if (max == 0)
  206.             return maxScore;
  207.         return (int) ((common(dst) * maxScore) / max);
  208.     }

  209.     long common(SimilarityIndex dst) {
  210.         return common(this, dst);
  211.     }

  212.     private static long common(SimilarityIndex src, SimilarityIndex dst) {
  213.         int srcIdx = src.packedIndex(0);
  214.         int dstIdx = dst.packedIndex(0);
  215.         long[] srcHash = src.idHash;
  216.         long[] dstHash = dst.idHash;
  217.         return common(srcHash, srcIdx, dstHash, dstIdx);
  218.     }

  219.     private static long common(long[] srcHash, int srcIdx, //
  220.             long[] dstHash, int dstIdx) {
  221.         if (srcIdx == srcHash.length || dstIdx == dstHash.length)
  222.             return 0;

  223.         long common = 0;
  224.         int srcKey = keyOf(srcHash[srcIdx]);
  225.         int dstKey = keyOf(dstHash[dstIdx]);

  226.         for (;;) {
  227.             if (srcKey == dstKey) {
  228.                 common += Math.min(countOf(srcHash[srcIdx]),
  229.                         countOf(dstHash[dstIdx]));

  230.                 if (++srcIdx == srcHash.length)
  231.                     break;
  232.                 srcKey = keyOf(srcHash[srcIdx]);

  233.                 if (++dstIdx == dstHash.length)
  234.                     break;
  235.                 dstKey = keyOf(dstHash[dstIdx]);

  236.             } else if (srcKey < dstKey) {
  237.                 // Regions of src which do not appear in dst.
  238.                 if (++srcIdx == srcHash.length)
  239.                     break;
  240.                 srcKey = keyOf(srcHash[srcIdx]);

  241.             } else /* if (dstKey < srcKey) */{
  242.                 // Regions of dst which do not appear in src.
  243.                 if (++dstIdx == dstHash.length)
  244.                     break;
  245.                 dstKey = keyOf(dstHash[dstIdx]);
  246.             }
  247.         }

  248.         return common;
  249.     }

  250.     // Testing only
  251.     int size() {
  252.         return idSize;
  253.     }

  254.     // Testing only
  255.     int key(int idx) {
  256.         return keyOf(idHash[packedIndex(idx)]);
  257.     }

  258.     // Testing only
  259.     long count(int idx) {
  260.         return countOf(idHash[packedIndex(idx)]);
  261.     }

  262.     // Brute force approach only for testing.
  263.     int findIndex(int key) {
  264.         for (int i = 0; i < idSize; i++)
  265.             if (key(i) == key)
  266.                 return i;
  267.         return -1;
  268.     }

  269.     private int packedIndex(int idx) {
  270.         return (idHash.length - idSize) + idx;
  271.     }

  272.     void add(int key, int cnt) throws TableFullException {
  273.         key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

  274.         int j = slot(key);
  275.         for (;;) {
  276.             long v = idHash[j];
  277.             if (v == 0) {
  278.                 // Empty slot in the table, store here.
  279.                 if (idGrowAt <= idSize) {
  280.                     grow();
  281.                     j = slot(key);
  282.                     continue;
  283.                 }
  284.                 idHash[j] = pair(key, cnt);
  285.                 idSize++;
  286.                 return;

  287.             } else if (keyOf(v) == key) {
  288.                 // Same key, increment the counter. If it overflows, fail
  289.                 // indexing to prevent the key from being impacted.
  290.                 //
  291.                 idHash[j] = pair(key, countOf(v) + cnt);
  292.                 return;

  293.             } else if (++j >= idHash.length) {
  294.                 j = 0;
  295.             }
  296.         }
  297.     }

  298.     private static long pair(int key, long cnt) throws TableFullException {
  299.         if (MAX_COUNT < cnt)
  300.             throw new TableFullException();
  301.         return (((long) key) << KEY_SHIFT) | cnt;
  302.     }

  303.     private int slot(int key) {
  304.         // We use 31 - idHashBits because the upper bit was already forced
  305.         // to be 0 and we want the remaining high bits to be used as the
  306.         // table slot.
  307.         //
  308.         return key >>> (31 - idHashBits);
  309.     }

  310.     private static int growAt(int idHashBits) {
  311.         return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
  312.     }

  313.     @SuppressWarnings("UnusedException")
  314.     private void grow() throws TableFullException {
  315.         if (idHashBits == 30)
  316.             throw new TableFullException();

  317.         long[] oldHash = idHash;
  318.         int oldSize = idHash.length;

  319.         idHashBits++;
  320.         idGrowAt = growAt(idHashBits);

  321.         try {
  322.             idHash = new long[1 << idHashBits];
  323.         } catch (OutOfMemoryError noMemory) {
  324.             throw TABLE_FULL_OUT_OF_MEMORY;
  325.         }

  326.         for (int i = 0; i < oldSize; i++) {
  327.             long v = oldHash[i];
  328.             if (v != 0) {
  329.                 int j = slot(keyOf(v));
  330.                 while (idHash[j] != 0)
  331.                     if (++j >= idHash.length)
  332.                         j = 0;
  333.                 idHash[j] = v;
  334.             }
  335.         }
  336.     }

  337.     private static int keyOf(long v) {
  338.         return (int) (v >>> KEY_SHIFT);
  339.     }

  340.     private static long countOf(long v) {
  341.         return v & MAX_COUNT;
  342.     }

  343.     /** Thrown by {@code create()} when file is too large. */
  344.     public static class TableFullException extends Exception {
  345.         private static final long serialVersionUID = 1L;
  346.     }
  347. }