Skip to content

Commit

Permalink
scale BigDecimal col stats correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
the-other-tim-brown committed Jan 7, 2025
1 parent 2b01cc0 commit c8a825f
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
package org.apache.xtable.delta;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.MathContext;
import java.math.RoundingMode;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.text.DateFormat;
Expand Down Expand Up @@ -63,7 +64,7 @@ public static Object convertFromDeltaColumnStatValue(Object value, InternalSchem
return null;
}
if (noConversionForSchema(fieldSchema)) {
return castObjectToInternalType(value, fieldSchema.getDataType());
return castObjectToInternalType(value, fieldSchema);
}
// Needs special handling for date and time.
InternalType fieldType = fieldSchema.getDataType();
Expand Down Expand Up @@ -198,7 +199,8 @@ public static Object convertFromDeltaPartitionValue(
}
}

private static Object castObjectToInternalType(Object value, InternalType valueType) {
private static Object castObjectToInternalType(Object value, InternalSchema schema) {
InternalType valueType = schema.getDataType();
switch (valueType) {
case DOUBLE:
if (value instanceof String)
Expand Down Expand Up @@ -232,7 +234,7 @@ private static Object castObjectToInternalType(Object value, InternalType valueT
}
break;
case DECIMAL:
return numberTypeToBigDecimal(value);
return numberTypeToBigDecimal(value, schema);
case LONG:
if (value instanceof Integer) {
return ((Integer) value).longValue();
Expand All @@ -242,18 +244,12 @@ private static Object castObjectToInternalType(Object value, InternalType valueT
return value;
}

private static BigDecimal numberTypeToBigDecimal(Object value) {
// BigDecimal is parsed as Integer, Long, BigInteger and double if none of the above.
if (value instanceof Integer) {
return BigDecimal.valueOf((Integer) value);
} else if (value instanceof Long) {
return BigDecimal.valueOf((Long) value);
} else if (value instanceof BigInteger) {
return new BigDecimal((BigInteger) value);
} else if (value instanceof Double) {
return BigDecimal.valueOf((Double) value);
} else {
return (BigDecimal) value;
}
private static BigDecimal numberTypeToBigDecimal(Object value, InternalSchema schema) {
// BigDecimal is parsed by converting the value to a string and setting the proper scale and
// precision.
int precision = (int) schema.getMetadata().get(InternalSchema.MetadataKey.DECIMAL_PRECISION);
int scale = (int) schema.getMetadata().get(InternalSchema.MetadataKey.DECIMAL_SCALE);
return new BigDecimal(String.valueOf(value), new MathContext(precision))
.setScale(scale, RoundingMode.UNNECESSARY);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.nio.ByteBuffer;
import java.sql.Date;
import java.util.ArrayList;
Expand Down Expand Up @@ -200,14 +201,16 @@ private static ColumnStat getColumnStatFromHudiStat(
Comparable<?> minValue = HoodieAvroUtils.unwrapAvroValueWrapper(columnStats.getMinValue());
Comparable<?> maxValue = HoodieAvroUtils.unwrapAvroValueWrapper(columnStats.getMaxValue());
if (field.getSchema().getDataType() == InternalType.DECIMAL) {
int scale =
(int) field.getSchema().getMetadata().get(InternalSchema.MetadataKey.DECIMAL_SCALE);
minValue =
minValue instanceof ByteBuffer
? convertBytesToBigDecimal((ByteBuffer) minValue, DECIMAL_WRAPPER_SCALE)
: minValue;
? convertBytesToBigDecimal((ByteBuffer) minValue, scale)
: ((BigDecimal) minValue).setScale(scale, RoundingMode.UNNECESSARY);
maxValue =
maxValue instanceof ByteBuffer
? convertBytesToBigDecimal((ByteBuffer) maxValue, DECIMAL_WRAPPER_SCALE)
: maxValue;
? convertBytesToBigDecimal((ByteBuffer) maxValue, scale)
: ((BigDecimal) maxValue).setScale(scale, RoundingMode.UNNECESSARY);
}
return getColumnStatFromValues(
minValue,
Expand All @@ -221,7 +224,9 @@ private static ColumnStat getColumnStatFromHudiStat(
private static BigDecimal convertBytesToBigDecimal(ByteBuffer value, int scale) {
byte[] bytes = new byte[value.remaining()];
value.duplicate().get(bytes);
return new BigDecimal(new BigInteger(bytes), scale);
BigDecimal serializedValue = new BigDecimal(new BigInteger(bytes), DECIMAL_WRAPPER_SCALE);
// set the scale to match the schema
return serializedValue.setScale(scale, RoundingMode.UNNECESSARY);
}

private static ColumnStat getColumnStatFromColRange(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
Expand All @@ -33,6 +36,8 @@
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import com.google.common.collect.ImmutableMap;

import org.apache.xtable.model.schema.InternalSchema;
import org.apache.xtable.model.schema.InternalType;
import org.apache.xtable.model.schema.PartitionTransformType;
Expand Down Expand Up @@ -214,4 +219,109 @@ private static Stream<Arguments> nonNumericValuesForColStats() {
Arguments.of(Double.NaN, doubleSchema, Double.NaN),
Arguments.of(Double.POSITIVE_INFINITY, doubleSchema, Double.POSITIVE_INFINITY));
}

@ParameterizedTest
@MethodSource("decimalValues")
void parseDecimalValues(
Object deltaValue, InternalSchema fieldSchema, BigDecimal expectedOutput) {
assertEquals(
expectedOutput,
DeltaValueConverter.convertFromDeltaColumnStatValue(deltaValue, fieldSchema));
}

private static Stream<Arguments> decimalValues() {
return Stream.of(
Arguments.of(
-8.00,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build(),
new BigDecimal("-8.00", new MathContext(5, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
-8.00f,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build(),
new BigDecimal("-8.00", new MathContext(5, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1000,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1000L,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
"1000",
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1000.00", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
1234.56,
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)),
Arguments.of(
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY),
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 6)
.build())
.build(),
new BigDecimal("1234.56", new MathContext(6, RoundingMode.UNNECESSARY))
.setScale(2, RoundingMode.UNNECESSARY)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -286,12 +286,8 @@ private void validateOutput(List<InternalDataFile> output) {
assertEquals(1, decimalColumnStat.getNumNulls());
assertEquals(2, decimalColumnStat.getNumValues());
assertTrue(decimalColumnStat.getTotalSize() > 0);
assertEquals(
new BigDecimal("1234.56"),
((BigDecimal) decimalColumnStat.getRange().getMinValue()).setScale(2));
assertEquals(
new BigDecimal("1234.56"),
((BigDecimal) decimalColumnStat.getRange().getMaxValue()).setScale(2));
assertEquals(new BigDecimal("1234.56"), decimalColumnStat.getRange().getMinValue());
assertEquals(new BigDecimal("1234.56"), decimalColumnStat.getRange().getMaxValue());
}

private HoodieRecord<HoodieAvroPayload> buildRecord(GenericRecord record) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import lombok.AccessLevel;
import lombok.NoArgsConstructor;

import com.google.common.collect.ImmutableMap;

import org.apache.xtable.model.schema.InternalField;
import org.apache.xtable.model.schema.InternalSchema;
import org.apache.xtable.model.schema.InternalType;
Expand Down Expand Up @@ -174,7 +176,16 @@ public class ColumnStatMapUtil {
private static final InternalField DECIMAL_FIELD =
InternalField.builder()
.name("decimal_field")
.schema(InternalSchema.builder().name("decimal").dataType(InternalType.DECIMAL).build())
.schema(
InternalSchema.builder()
.name("decimal")
.dataType(InternalType.DECIMAL)
.metadata(
ImmutableMap.<InternalSchema.MetadataKey, Object>builder()
.put(InternalSchema.MetadataKey.DECIMAL_SCALE, 2)
.put(InternalSchema.MetadataKey.DECIMAL_PRECISION, 5)
.build())
.build())
.build();

private static final InternalField FLOAT_FIELD =
Expand Down Expand Up @@ -312,7 +323,7 @@ public static List<ColumnStat> getColumnStats() {
ColumnStat.builder()
.field(DECIMAL_FIELD)
.numNulls(1)
.range(Range.vector(new BigDecimal("1.0"), new BigDecimal("2.0")))
.range(Range.vector(new BigDecimal("1.00"), new BigDecimal("2.00")))
.numValues(50)
.totalSize(123)
.build();
Expand Down

0 comments on commit c8a825f

Please sign in to comment.