Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.PriorityQueue;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
Expand Down Expand Up @@ -106,39 +105,53 @@ public FuzzyRowFilter(List<Pair<byte[], byte[]>> fuzzyKeysData) {
p.setFirst(Arrays.copyOf(aFuzzyKeysData.getFirst(), aFuzzyKeysData.getFirst().length));
p.setSecond(Arrays.copyOf(aFuzzyKeysData.getSecond(), aFuzzyKeysData.getSecond().length));

// update mask ( 0 -> -1 (0xff), 1 -> 2)
// Normalize the mask, zero the non-fixed key bytes, then fix the unsafe mask to its final
// {-1, 0} form once here so it is never mutated during scanning.
p.setSecond(preprocessMask(p.getSecond()));
preprocessSearchKey(p);
preprocessSearchKey(p, UNSAFE_UNALIGNED);
preprocessMaskForSatisfies(p.getSecond(), UNSAFE_UNALIGNED);

fuzzyKeyDataCopy.add(p);
}
this.fuzzyKeysData = fuzzyKeyDataCopy;
this.tracker = new RowTracker();
}

private void preprocessSearchKey(Pair<byte[], byte[]> p) {
if (!UNSAFE_UNALIGNED) {
// do nothing
return;
}
/**
* Zeroes the non-fixed ("don't care") positions of the search key (on both paths) so the
* next-cell hint from {@link #getNextForFuzzyRule} is the smallest matching row. The byte at a
* non-fixed position is never compared, so this affects neither matching nor deserialization.
*/
static void preprocessSearchKey(Pair<byte[], byte[]> p, boolean unsafeUnaligned) {
byte[] key = p.getFirst();
byte[] mask = p.getSecond();
for (int i = 0; i < mask.length; i++) {
// set non-fixed part of a search key to 0.
if (mask[i] == 2) {
// non-fixed is encoded as 2 on the unsafe path ({-1, 2}) and as 1 on no-unsafe ({0, 1})
if ((unsafeUnaligned && mask[i] == 2) || (!unsafeUnaligned && mask[i] == 1)) {
key[i] = 0;
}
}
}

/**
* We need to preprocess mask array, as since we treat 2's as unfixed positions and -1 (0xff) as
* fixed positions
* Normalizes the incoming mask to the active path's encoding. Input is the public {0, 1} form, or
* the already-preprocessed unsafe {-1, 2} form when {@link #parseFrom} deserializes a filter from
* an unsafe peer; accepting both lets a filter serialized on one platform work on the other.
* Unsafe keeps/produces {-1, 2}; no-unsafe keeps/produces {0, 1}.
* @return mask array
*/
private byte[] preprocessMask(byte[] mask) {
if (!UNSAFE_UNALIGNED) {
// do nothing
if (isPreprocessedMask(mask)) {
// deserialized {-1, 2} from an unsafe peer -> restore {0, 1}
for (int i = 0; i < mask.length; i++) {
if (mask[i] == -1) {
mask[i] = 0; // -1 -> 0
} else if (mask[i] == 2) {
mask[i] = 1; // 2 -> 1
}
}
}
return mask;
}
if (isPreprocessedMask(mask)) return mask;
Expand All @@ -161,6 +174,23 @@ private boolean isPreprocessedMask(byte[] mask) {
return true;
}

/**
* Converts a stored mask back to the public {0 (fixed), 1 (non-fixed)} form as a new array, so
* serialization, {@link #getFuzzyKeys}, equals and hashCode never expose the internal encoding.
* No-unsafe already stores {0, 1}; unsafe stores {-1, 0}, where -1 is fixed and anything else is
* non-fixed.
* @return a new array in {0, 1} form
*/
private static byte[] toConstructorMask(byte[] mask, boolean unsafeUnaligned) {
byte[] out = Arrays.copyOf(mask, mask.length);
if (unsafeUnaligned) {
for (int i = 0; i < out.length; i++) {
out[i] = (byte) (out[i] == -1 ? 0 : 1);
}
}
return out;
}

/**
* Returns the Fuzzy keys in the format expected by the constructor.
* @return the Fuzzy keys in the format expected by the constructor
Expand All @@ -171,18 +201,7 @@ public List<Pair<byte[], byte[]>> getFuzzyKeys() {
Pair<byte[], byte[]> returnKey = new Pair<>();
// This won't revert the original key's don't care values, but we don't care.
returnKey.setFirst(Arrays.copyOf(fuzzyKey.getFirst(), fuzzyKey.getFirst().length));
byte[] returnMask = Arrays.copyOf(fuzzyKey.getSecond(), fuzzyKey.getSecond().length);
if (UNSAFE_UNALIGNED && isPreprocessedMask(returnMask)) {
// Revert the preprocessing.
for (int i = 0; i < returnMask.length; i++) {
if (returnMask[i] == -1) {
returnMask[i] = 0; // -1 >> 0
} else if (returnMask[i] == 2) {
returnMask[i] = 1;// 2 >> 1
}
}
}
returnKey.setSecond(returnMask);
returnKey.setSecond(toConstructorMask(fuzzyKey.getSecond(), UNSAFE_UNALIGNED));
returnList.add(returnKey);
}
return returnList;
Expand All @@ -205,10 +224,6 @@ public ReturnCode filterCell(final Cell c) {
for (int i = startIndex; i < size + startIndex; i++) {
final int index = i % size;
Pair<byte[], byte[]> fuzzyData = fuzzyKeysData.get(index);
// This shift is idempotent - always end up with 0 and -1 as mask values.
for (int j = 0; j < fuzzyData.getSecond().length; j++) {
fuzzyData.getSecond()[j] >>= 2;
}
SatisfiesCode satisfiesCode = satisfies(isReversed(), c.getRowArray(), c.getRowOffset(),
c.getRowLength(), fuzzyData.getFirst(), fuzzyData.getSecond());
if (satisfiesCode == SatisfiesCode.YES) {
Expand Down Expand Up @@ -316,9 +331,11 @@ boolean isNextRowSameAs(Cell currentCell) {
}

byte[] updateWith(Cell currentCell, Pair<byte[], byte[]> fuzzyData) {
byte[] nextRowKeyCandidate =
getNextForFuzzyRule(isReversed(), currentCell.getRowArray(), currentCell.getRowOffset(),
currentCell.getRowLength(), fuzzyData.getFirst(), fuzzyData.getSecond());
// getNextForFuzzyRule needs {-1, 0}: a converted copy on no-unsafe, the stored mask on
// unsafe.
byte[] fuzzyKeyMeta = preprocessMaskForHinting(fuzzyData.getSecond(), UNSAFE_UNALIGNED);
byte[] nextRowKeyCandidate = getNextForFuzzyRule(isReversed(), currentCell.getRowArray(),
currentCell.getRowOffset(), currentCell.getRowLength(), fuzzyData.getFirst(), fuzzyKeyMeta);
if (nextRowKeyCandidate != null) {
nextRows.add(new Pair<>(nextRowKeyCandidate, fuzzyData));
}
Expand All @@ -339,7 +356,9 @@ public byte[] toByteArray() {
for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
BytesBytesPair.Builder bbpBuilder = BytesBytesPair.newBuilder();
bbpBuilder.setFirst(UnsafeByteOperations.unsafeWrap(fuzzyData.getFirst()));
bbpBuilder.setSecond(UnsafeByteOperations.unsafeWrap(fuzzyData.getSecond()));
// Emit the public {0, 1} mask, not the internal form, so the wire is platform-independent.
bbpBuilder.setSecond(UnsafeByteOperations
.unsafeWrap(toConstructorMask(fuzzyData.getSecond(), UNSAFE_UNALIGNED)));
builder.addFuzzyKeysData(bbpBuilder);
}
return builder.build().toByteArray();
Expand Down Expand Up @@ -467,6 +486,41 @@ static SatisfiesCode satisfies(boolean reverse, byte[] row, int offset, int leng
return SatisfiesCode.YES;
}

/**
* Mutates {@code mask} in place into the form {@link #satisfies} expects. Called once from the
* constructor so the stored mask is fixed up front and never mutated during scanning. Unsafe:
* shift {-1, 2} -&gt; {-1, 0} (the word-based satisfies wants non-fixed = 0). No-unsafe: no-op,
* {@link #satisfiesNoUnsafe} already wants {0, 1}.
*/
static void preprocessMaskForSatisfies(byte[] mask, boolean unsafeUnaligned) {
if (!unsafeUnaligned) {
return;
}
for (int i = 0; i < mask.length; i++) {
mask[i] >>= 2;
}
}

/**
* Returns the mask in the {-1 (fixed), 0 (non-fixed)} form {@link #getNextForFuzzyRule} expects.
* No-unsafe converts {0, 1} into a NEW array (the stored {0, 1} is still needed by
* {@link #satisfiesNoUnsafe}); unsafe is already {-1, 0}, returned as is.
*/
static byte[] preprocessMaskForHinting(byte[] mask, boolean unsafeUnaligned) {
if (unsafeUnaligned) {
return mask;
}
byte[] converted = Arrays.copyOf(mask, mask.length);
for (int i = 0; i < converted.length; i++) {
if (converted[i] == 0) {
converted[i] = -1;
} else if (converted[i] == 1) {
converted[i] = 0;
}
}
return converted;
}

static SatisfiesCode satisfiesNoUnsafe(boolean reverse, byte[] row, int offset, int length,
byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
if (row == null) {
Expand Down Expand Up @@ -713,9 +767,11 @@ boolean areSerializedFieldsEqual(Filter o) {
for (int i = 0; i < fuzzyKeysData.size(); ++i) {
Pair<byte[], byte[]> thisData = this.fuzzyKeysData.get(i);
Pair<byte[], byte[]> otherData = other.fuzzyKeysData.get(i);
// Compare masks in the normalized {0, 1} form, so equality matches the serialized bytes.
if (
!(Bytes.equals(thisData.getFirst(), otherData.getFirst())
&& Bytes.equals(thisData.getSecond(), otherData.getSecond()))
&& Bytes.equals(toConstructorMask(thisData.getSecond(), UNSAFE_UNALIGNED),
toConstructorMask(otherData.getSecond(), UNSAFE_UNALIGNED)))
) {
return false;
}
Expand All @@ -730,6 +786,12 @@ public boolean equals(Object obj) {

@Override
public int hashCode() {
return Objects.hash(this.fuzzyKeysData);
int result = 1;
for (Pair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
result = 31 * result + Bytes.hashCode(fuzzyData.getFirst());
result =
31 * result + Bytes.hashCode(toConstructorMask(fuzzyData.getSecond(), UNSAFE_UNALIGNED));
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,43 @@ public void testReverseFilterListSkipsSameRowFuzzyHint() throws IOException {
}
}

/**
* Serializing a filter must not leak the internal mask encoding. On the unsafe path the stored
* mask is {-1 (fixed), 0 (non-fixed)}; {@code toByteArray} must emit the public constructor {0,
* 1} form so the round-tripped filter behaves identically (otherwise the leaked {-1, 0} is
* reparsed as an all-fixed mask). Processing a cell first must not change this.
*/
@Test
public void testSerializationAfterFilterCellPreservesBehavior() throws Exception {
FuzzyRowFilter original =
new FuzzyRowFilter(Arrays.asList(new Pair<>(new byte[] { 1, 2, 3 }, new byte[] { 0, 1, 0 })));
// Process a cell first to prove serialization is unaffected by scanning.
original.filterCell(KeyValueUtil.createFirstOnRow(new byte[] { 1, 50, 3 }));

FuzzyRowFilter parsed = FuzzyRowFilter.parseFrom(original.toByteArray());
// A row matching only via the wildcard position must still be INCLUDEEd after the round-trip.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Super-minor / nit: typo:

Suggested change
// A row matching only via the wildcard position must still be INCLUDEEd after the round-trip.
// A row matching only via the wildcard position must still be INCLUDED after the round-trip.

assertEquals(Filter.ReturnCode.INCLUDE,
parsed.filterCell(KeyValueUtil.createFirstOnRow(new byte[] { 1, 99, 3 })));
}

/**
* Two filters built from the same rule (distinct array instances) must be equal and hash equally,
* i.e. {@code equals}/{@code hashCode} must be content-based, not identity-based, and consistent
* with each other and with the serialized ({0, 1}) form. This must also hold after one of them
* has processed a cell.
*/
@Test
public void testEqualsConsistentAfterFilterCell() {
FuzzyRowFilter fresh =
new FuzzyRowFilter(Arrays.asList(new Pair<>(new byte[] { 1, 2, 3 }, new byte[] { 0, 1, 0 })));
FuzzyRowFilter scanned =
new FuzzyRowFilter(Arrays.asList(new Pair<>(new byte[] { 1, 2, 3 }, new byte[] { 0, 1, 0 })));
scanned.filterCell(KeyValueUtil.createFirstOnRow(new byte[] { 1, 50, 3 }));

assertEquals(fresh, scanned);
assertEquals(fresh.hashCode(), scanned.hashCode());
}

private static FuzzyRowFilter newReverseFuzzyRowFilter() {
FuzzyRowFilter filter =
new FuzzyRowFilter(Arrays.asList(new Pair<>(Bytes.toBytes("aaa"), new byte[] { 0, 1, 0 })));
Expand Down
Loading