From 2a3a9e91d4f7e4c64039ec72473d29af89f3fd4a Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 25 Mar 2024 02:55:34 +0300 Subject: [PATCH] LJpegDecompressor: make 'trailing pixels' workaround less special Looks like not specializing all of the loops but just peeling it from the innermost loop is good-enough. ``` Comparing /home/lebedevri/rawspeed/build-Clang18-release/src/utilities/rsbench/rsbench-old to /home/lebedevri/rawspeed/build-Clang18-release/src/utilities/rsbench/rsbench Benchmark Time CPU Time Old Time New CPU Old CPU New -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_pvalue 0.0806 0.0776 U Test, Repetitions: 27 vs 27 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_mean -0.0149 -0.0146 12 11 367 362 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_median -0.0134 -0.0125 12 11 364 360 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_stddev -0.1154 -0.1142 0 0 12 11 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_cv -0.1020 -0.1011 0 0 0 0 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_pvalue 0.4675 0.4999 U Test, Repetitions: 27 vs 27 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_mean -0.0061 -0.0052 15 15 484 481 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_median -0.0127 -0.0130 15 15 483 477 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_stddev -0.3251 -0.3283 1 0 20 14 ./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_cv -0.3210 -0.3248 0 0 0 0 ./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_pvalue 0.4363 0.3776 U Test, Repetitions: 27 vs 27 ./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_mean -0.0021 -0.0021 30 30 946 945 ./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_median -0.0049 -0.0061 30 30 944 938 ./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_stddev +0.2897 +0.3269 1 1 17 23 ./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_cv +0.2925 +0.3296 0 0 0 0 ./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_pvalue 0.0002 0.0001 U Test, Repetitions: 27 vs 27 ./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_mean +0.0377 +0.0377 4 4 127 131 ./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_median +0.0351 +0.0366 4 4 125 129 ./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_stddev +0.2754 +0.2903 0 0 3 4 ./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_cv +0.2292 +0.2435 0 0 0 0 ./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_pvalue 0.0000 0.0002 U Test, Repetitions: 27 vs 27 ./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_mean -0.0016 -0.0011 150 150 150 150 ./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_median -0.0013 -0.0010 150 150 150 150 ./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_stddev +0.2080 -0.0552 0 0 0 0 ./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_cv +0.2099 -0.0542 0 0 0 0 ./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_pvalue 0.0448 0.8763 U Test, Repetitions: 27 vs 27 ./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_mean -0.0020 +0.0015 583 582 6974 6985 ./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_median -0.0022 -0.0018 582 581 7190 7177 ./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_stddev +0.2048 -0.0249 3 4 1162 1133 ./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_cv +0.2073 -0.0264 0 0 0 0 ./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_pvalue 0.0234 0.0280 U Test, Repetitions: 27 vs 27 ./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_mean -0.0142 -0.0145 29 29 920 907 ./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_median -0.0237 -0.0215 29 29 914 895 ./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_stddev +0.0196 -0.0155 1 1 26 26 ./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_cv +0.0342 -0.0010 0 0 0 0 OVERALL_GEOMEAN -0.0006 +0.0007 0 0 1 1 ``` --- .../decompressors/LJpegDecompressor.cpp | 51 ++++++------------- .../decompressors/LJpegDecompressor.h | 5 +- 2 files changed, 17 insertions(+), 39 deletions(-) diff --git a/src/librawspeed/decompressors/LJpegDecompressor.cpp b/src/librawspeed/decompressors/LJpegDecompressor.cpp index 9c0602db3..9c4015ef2 100644 --- a/src/librawspeed/decompressors/LJpegDecompressor.cpp +++ b/src/librawspeed/decompressors/LJpegDecompressor.cpp @@ -167,7 +167,7 @@ std::array LJpegDecompressor::getInitialPreds() const { return preds; } -template +template void LJpegDecompressor::decodeRowN( Array1DRef outRow, std::array pred, std::array>, N_COMP> ht, @@ -187,10 +187,7 @@ void LJpegDecompressor::decodeRowN( } // Sometimes we also need to consume one more block, and produce part of it. - if /*constexpr*/ (WeirdWidth) { - // FIXME: evaluate i-cache implications due to this being compile-time. - static_assert(N_COMP > 1 || !WeirdWidth, - "can't want part of 1-pixel-wide block"); + if (trailingPixels != 0) { // Some rather esoteric DNG's have odd dimensions, e.g. width % 2 = 1. // We may end up needing just part of last N_COMP pixels. invariant(trailingPixels > 0); @@ -218,8 +215,7 @@ void LJpegDecompressor::decodeRowN( } // N_COMP == number of components (2, 3 or 4) -template -ByteStream::size_type LJpegDecompressor::decodeN() const { +template ByteStream::size_type LJpegDecompressor::decodeN() const { invariant(mRaw->getCpp() > 0); invariant(N_COMP > 0); @@ -292,7 +288,7 @@ ByteStream::size_type LJpegDecompressor::decodeN() const { /*index=*/0) .getAsArray1DRef(); - decodeRowN(outRow, pred, ht, bs); + decodeRowN(outRow, pred, ht, bs); } inputStream.skipBytes(bs.getStreamPosition()); @@ -302,34 +298,17 @@ ByteStream::size_type LJpegDecompressor::decodeN() const { } ByteStream::size_type LJpegDecompressor::decode() const { - if (trailingPixels == 0) { - switch (frame.cps) { - case 1: - return decodeN<1>(); - case 2: - return decodeN<2>(); - case 3: - return decodeN<3>(); - case 4: - return decodeN<4>(); - default: - __builtin_unreachable(); - } - } else /* trailingPixels != 0 */ { - // FIXME: using different function just for one tile likely causes - // i-cache misses and whatnot. Need to check how not splitting it into - // two different functions affects performance of the normal case. - switch (frame.cps) { - // Naturally can't happen for CPS=1. - case 2: - return decodeN<2, /*WeirdWidth=*/true>(); - case 3: - return decodeN<3, /*WeirdWidth=*/true>(); - case 4: - return decodeN<4, /*WeirdWidth=*/true>(); - default: - __builtin_unreachable(); - } + switch (frame.cps) { + case 1: + return decodeN<1>(); + case 2: + return decodeN<2>(); + case 3: + return decodeN<3>(); + case 4: + return decodeN<4>(); + default: + __builtin_unreachable(); } } diff --git a/src/librawspeed/decompressors/LJpegDecompressor.h b/src/librawspeed/decompressors/LJpegDecompressor.h index 14abeb320..18aef77f0 100644 --- a/src/librawspeed/decompressors/LJpegDecompressor.h +++ b/src/librawspeed/decompressors/LJpegDecompressor.h @@ -75,14 +75,13 @@ class LJpegDecompressor final { template [[nodiscard]] std::array getInitialPreds() const; - template + template __attribute__((always_inline)) inline void decodeRowN( Array1DRef outRow, std::array pred, std::array>, N_COMP> ht, BitStreamerJPEG& bs) const; - template - [[nodiscard]] ByteStream::size_type decodeN() const; + template [[nodiscard]] ByteStream::size_type decodeN() const; public: LJpegDecompressor(RawImage img, iRectangle2D imgFrame, Frame frame,