Skip to content

Commit

Permalink
LJpegDecompressor: make 'trailing pixels' workaround less special
Browse files Browse the repository at this point in the history
It looks like we don't need to specialize all of the loops for this;
just peeling the workaround out of the innermost loop is good enough.

```
Comparing /home/lebedevri/rawspeed/build-Clang18-release/src/utilities/rsbench/rsbench-old to /home/lebedevri/rawspeed/build-Clang18-release/src/utilities/rsbench/rsbench
Benchmark                                                                                                                               Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_pvalue                 0.0806          0.0776      U Test, Repetitions: 27 vs 27
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_mean                  -0.0149         -0.0146            12            11           367           362
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_median                -0.0134         -0.0125            12            11           364           360
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_stddev                -0.1154         -0.1142             0             0            12            11
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9394-compressed-lossless.DNG/threads:32/process_time/real_time_cv                    -0.1020         -0.1011             0             0             0             0
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_pvalue                 0.4675          0.4999      U Test, Repetitions: 27 vs 27
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_mean                  -0.0061         -0.0052            15            15           484           481
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_median                -0.0127         -0.0130            15            15           483           477
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_stddev                -0.3251         -0.3283             1             0            20            14
./Adobe DNG Converter/Canon EOS 5D Mark III/5G4A9395-compressed-lossless.DNG/threads:32/process_time/real_time_cv                    -0.3210         -0.3248             0             0             0             0
./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_pvalue                                      0.4363          0.3776      U Test, Repetitions: 27 vs 27
./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_mean                                       -0.0021         -0.0021            30            30           946           945
./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_median                                     -0.0049         -0.0061            30            30           944           938
./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_stddev                                     +0.2897         +0.3269             1             1            17            23
./Adobe DNG Converter/Canon EOS 5D Mark IV/B13A0729.dng/threads:32/process_time/real_time_cv                                         +0.2925         +0.3296             0             0             0             0
./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_pvalue                                        0.0002          0.0001      U Test, Repetitions: 27 vs 27
./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_mean                                         +0.0377         +0.0377             4             4           127           131
./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_median                                       +0.0351         +0.0366             4             4           125           129
./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_stddev                                       +0.2754         +0.2903             0             0             3             4
./Fujifilm/X100S/fujifilm-x100s-daylight-DSCF9505.dng/threads:32/process_time/real_time_cv                                           +0.2292         +0.2435             0             0             0             0
./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_pvalue                                               0.0000          0.0002      U Test, Repetitions: 27 vs 27
./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_mean                                                -0.0016         -0.0011           150           150           150           150
./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_median                                              -0.0013         -0.0010           150           150           150           150
./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_stddev                                              +0.2080         -0.0552             0             0             0             0
./Samsung/Galaxy S21 Ultra/20230712_115041.dng/threads:32/process_time/real_time_cv                                                  +0.2099         -0.0542             0             0             0             0
./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_pvalue                                               0.0448          0.8763      U Test, Repetitions: 27 vs 27
./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_mean                                                -0.0020         +0.0015           583           582          6974          6985
./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_median                                              -0.0022         -0.0018           582           581          7190          7177
./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_stddev                                              +0.2048         -0.0249             3             4          1162          1133
./Samsung/Galaxy S23 Ultra/20231214_130645.dng/threads:32/process_time/real_time_cv                                                  +0.2073         -0.0264             0             0             0             0
./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_pvalue                                       0.0234          0.0280      U Test, Repetitions: 27 vs 27
./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_mean                                        -0.0142         -0.0145            29            29           920           907
./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_median                                      -0.0237         -0.0215            29            29           914           895
./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_stddev                                      +0.0196         -0.0155             1             1            26            26
./Sony/ILCE-7RM5/7RM5-S35-LosslessCompressedMedium.ARW/threads:32/process_time/real_time_cv                                          +0.0342         -0.0010             0             0             0             0
OVERALL_GEOMEAN                                                                                                                      -0.0006         +0.0007             0             0             1             1

```
  • Loading branch information
LebedevRI committed Mar 25, 2024
1 parent 55fbd20 commit 2a3a9e9
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 39 deletions.
51 changes: 15 additions & 36 deletions src/librawspeed/decompressors/LJpegDecompressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ std::array<uint16_t, N_COMP> LJpegDecompressor::getInitialPreds() const {
return preds;
}

template <int N_COMP, bool WeirdWidth>
template <int N_COMP>
void LJpegDecompressor::decodeRowN(
Array1DRef<uint16_t> outRow, std::array<uint16_t, N_COMP> pred,
std::array<std::reference_wrapper<const PrefixCodeDecoder<>>, N_COMP> ht,
Expand All @@ -187,10 +187,7 @@ void LJpegDecompressor::decodeRowN(
}

// Sometimes we also need to consume one more block, and produce part of it.
if /*constexpr*/ (WeirdWidth) {
// FIXME: evaluate i-cache implications due to this being compile-time.
static_assert(N_COMP > 1 || !WeirdWidth,
"can't want part of 1-pixel-wide block");
if (trailingPixels != 0) {
// Some rather esoteric DNG's have odd dimensions, e.g. width % 2 = 1.
// We may end up needing just part of last N_COMP pixels.
invariant(trailingPixels > 0);
Expand Down Expand Up @@ -218,8 +215,7 @@ void LJpegDecompressor::decodeRowN(
}

// N_COMP == number of components (2, 3 or 4)
template <int N_COMP, bool WeirdWidth>
ByteStream::size_type LJpegDecompressor::decodeN() const {
template <int N_COMP> ByteStream::size_type LJpegDecompressor::decodeN() const {
invariant(mRaw->getCpp() > 0);
invariant(N_COMP > 0);

Expand Down Expand Up @@ -292,7 +288,7 @@ ByteStream::size_type LJpegDecompressor::decodeN() const {
/*index=*/0)
.getAsArray1DRef();

decodeRowN<N_COMP, WeirdWidth>(outRow, pred, ht, bs);
decodeRowN<N_COMP>(outRow, pred, ht, bs);
}

inputStream.skipBytes(bs.getStreamPosition());
Expand All @@ -302,34 +298,17 @@ ByteStream::size_type LJpegDecompressor::decodeN() const {
}

ByteStream::size_type LJpegDecompressor::decode() const {
if (trailingPixels == 0) {
switch (frame.cps) {
case 1:
return decodeN<1>();
case 2:
return decodeN<2>();
case 3:
return decodeN<3>();
case 4:
return decodeN<4>();
default:
__builtin_unreachable();
}
} else /* trailingPixels != 0 */ {
// FIXME: using different function just for one tile likely causes
// i-cache misses and whatnot. Need to check how not splitting it into
// two different functions affects performance of the normal case.
switch (frame.cps) {
// Naturally can't happen for CPS=1.
case 2:
return decodeN<2, /*WeirdWidth=*/true>();
case 3:
return decodeN<3, /*WeirdWidth=*/true>();
case 4:
return decodeN<4, /*WeirdWidth=*/true>();
default:
__builtin_unreachable();
}
switch (frame.cps) {
case 1:
return decodeN<1>();
case 2:
return decodeN<2>();
case 3:
return decodeN<3>();
case 4:
return decodeN<4>();
default:
__builtin_unreachable();
}
}

Expand Down
5 changes: 2 additions & 3 deletions src/librawspeed/decompressors/LJpegDecompressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,13 @@ class LJpegDecompressor final {
template <int N_COMP>
[[nodiscard]] std::array<uint16_t, N_COMP> getInitialPreds() const;

template <int N_COMP, bool WeirdWidth>
template <int N_COMP>
__attribute__((always_inline)) inline void decodeRowN(
Array1DRef<uint16_t> outRow, std::array<uint16_t, N_COMP> pred,
std::array<std::reference_wrapper<const PrefixCodeDecoder<>>, N_COMP> ht,
BitStreamerJPEG& bs) const;

template <int N_COMP, bool WeirdWidth = false>
[[nodiscard]] ByteStream::size_type decodeN() const;
template <int N_COMP> [[nodiscard]] ByteStream::size_type decodeN() const;

public:
LJpegDecompressor(RawImage img, iRectangle2D imgFrame, Frame frame,
Expand Down

0 comments on commit 2a3a9e9

Please sign in to comment.