From 1d121b1e04369791328fabe8b203068d1dbd13e4 Mon Sep 17 00:00:00 2001 From: akankshamahajan Date: Mon, 20 Nov 2023 10:02:06 -0800 Subject: [PATCH] Make auto_readahead_size default true Summary: Make auto_readahead_size option default true Test Plan: benchmarks and existing tests Reviewers: Subscribers: Tasks: Tags: --- file/file_prefetch_buffer.cc | 22 +- file/file_prefetch_buffer.h | 39 +- file/prefetch_test.cc | 386 +----------------- include/rocksdb/options.h | 6 +- .../block_based/block_based_table_iterator.cc | 49 --- .../block_based/block_based_table_iterator.h | 1 - table/block_based/block_based_table_reader.cc | 2 +- table/block_based/block_based_table_reader.h | 11 +- table/block_based/block_prefetcher.cc | 14 +- table/block_based/block_prefetcher.h | 11 - table/block_based/partitioned_filter_block.cc | 2 +- table/block_based/partitioned_index_reader.cc | 2 +- table/table_test.cc | 30 ++ .../behavior_changes/auto_readahead_size.md | 1 + 14 files changed, 69 insertions(+), 507 deletions(-) create mode 100644 unreleased_history/behavior_changes/auto_readahead_size.md diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index d392537b9e5..2bd8c4a8584 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -356,6 +356,7 @@ void FilePrefetchBuffer::ReadAheadSizeTuning( uint64_t updated_end_offset = Roundup(start_offset + length + readahead_size, alignment); uint64_t initial_end_offset = updated_end_offset; + uint64_t initial_start_offset = updated_start_offset; // Callback to tune the start and end offsets. if (readaheadsize_cb_ != nullptr && readahead_size > 0) { @@ -365,6 +366,8 @@ void FilePrefetchBuffer::ReadAheadSizeTuning( // read_len will be 0 and there is nothing to read/prefetch. if (updated_start_offset == updated_end_offset) { + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (updated_end_offset - updated_start_offset)); return; } @@ -377,6 +380,8 @@ void FilePrefetchBuffer::ReadAheadSizeTuning( // means data has been already prefetched. if (updated_end_offset <= prev_buf_end_offset) { start_offset = end_offset = prev_buf_end_offset; + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (end_offset - start_offset)); return; } } @@ -404,6 +409,9 @@ void FilePrefetchBuffer::ReadAheadSizeTuning( // offset of next prefetch. bufs_[index].initial_end_offset_ = initial_end_offset; read_len = static_cast<size_t>(roundup_len - chunk_len); + + UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset), + (end_offset - start_offset)); } Status FilePrefetchBuffer::HandleOverlappingData( @@ -449,8 +457,7 @@ Status FilePrefetchBuffer::HandleOverlappingData( uint64_t start_offset = bufs_[second].initial_end_offset_; // Second buffer might be out of bound if first buffer already prefetched // that data. - if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size && - !IsOffsetOutOfBound(start_offset)) { + if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) { size_t read_len = 0; uint64_t end_offset = start_offset, chunk_len = 0; @@ -635,9 +642,6 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(const IOOptions& opts, // prefetching. uint64_t start_offset2 = bufs_[curr_].initial_end_offset_; - // Second buffer might be out of bound if first buffer already prefetched // that data.
- if (!IsOffsetOutOfBound(start_offset2)) { // Find updated readahead size after tuning size_t read_len2 = 0; uint64_t end_offset2 = start_offset2, chunk_len2 = 0; @@ -653,7 +657,6 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(const IOOptions& opts, bufs_[second].ClearBuffer(); return s; } - } } } @@ -737,7 +740,6 @@ bool FilePrefetchBuffer::TryReadFromCacheUntracked( return false; } } - UpdateReadAheadSizeForUpperBound(offset, n); s = Prefetch(opts, reader, offset, n + readahead_size_); } if (!s.ok()) { @@ -837,8 +839,6 @@ bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( } } - UpdateReadAheadSizeForUpperBound(offset, n); - // Prefetch n + readahead_size_/2 synchronously as remaining // readahead_size_/2 will be prefetched asynchronously. s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2, @@ -919,7 +919,6 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, explicit_prefetch_submitted_ = false; bool is_eligible_for_prefetching = false; - UpdateReadAheadSizeForUpperBound(offset, n); if (readahead_size_ > 0 && (!implicit_auto_readahead_ || num_file_reads_ >= num_file_reads_for_auto_readahead_)) { @@ -1014,14 +1013,13 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, start_offset2 = bufs_[curr_].initial_end_offset_; // Second buffer might be out of bound if first buffer already prefetched // that data. - if (!IsOffsetOutOfBound(start_offset2)) { + uint64_t end_offset2 = start_offset2, chunk_len2 = 0; ReadAheadSizeTuning(/*read_curr_block=*/false, /*refit_tail=*/false, /*prev_buf_end_offset=*/end_offset1, second, alignment, /*length=*/0, readahead_size, start_offset2, end_offset2, read_len2, chunk_len2); - } } if (read_len1) { diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index b0aa1f1c6a7..0c6ba7e6697 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -105,8 +105,7 @@ class FilePrefetchBuffer { size_t readahead_size = 0, size_t max_readahead_size = 0, bool enable = true, bool track_min_offset = false, bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, - uint64_t num_file_reads_for_auto_readahead = 0, - uint64_t upper_bound_offset = 0, FileSystem* fs = nullptr, + uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr, SystemClock* clock = nullptr, Statistics* stats = nullptr, const std::function<void(bool, uint64_t&, uint64_t&)>& cb = nullptr, FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) @@ -127,7 +126,6 @@ class FilePrefetchBuffer { clock_(clock), stats_(stats), usage_(usage), - upper_bound_offset_(upper_bound_offset), readaheadsize_cb_(cb) { assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || (num_file_reads_ == 0)); @@ -296,11 +294,6 @@ class FilePrefetchBuffer { // Callback function passed to underlying FS in case of asynchronous reads. void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg); - void ResetUpperBoundOffset(uint64_t upper_bound_offset) { - upper_bound_offset_ = upper_bound_offset; - readahead_size_ = initial_auto_readahead_size_; - } - void TEST_GetBufferOffsetandSize(uint32_t index, uint64_t& offset, size_t& len) { offset = bufs_[index].offset_; @@ -452,25 +445,6 @@ class FilePrefetchBuffer { uint64_t offset, size_t n, Slice* result, Status* status); - void UpdateReadAheadSizeForUpperBound(uint64_t offset, size_t n) { - // Adjust readhahead_size till upper_bound if upper_bound_offset_ is - // set.
- if (readahead_size_ > 0 && upper_bound_offset_ > 0 && - upper_bound_offset_ > offset) { - if (upper_bound_offset_ < offset + n + readahead_size_) { - readahead_size_ = (upper_bound_offset_ - offset) - n; - RecordTick(stats_, READAHEAD_TRIMMED); - } - } - } - - inline bool IsOffsetOutOfBound(uint64_t offset) { - if (upper_bound_offset_ > 0) { - return (offset >= upper_bound_offset_); - } - return false; - } - void ReadAheadSizeTuning(bool read_curr_block, bool refit_tail, uint64_t prev_buf_end_offset, uint32_t index, size_t alignment, size_t length, @@ -487,6 +461,13 @@ class FilePrefetchBuffer { } } + void UpdateReadAheadTrimmedStat(size_t initial_length, + size_t updated_length) { + if (initial_length != updated_length) { + RecordTick(stats_, READAHEAD_TRIMMED); + } + } + std::vector<BufferInfo> bufs_; // curr_ represents the index for bufs_ indicating which buffer is being // consumed currently. @@ -529,10 +510,6 @@ class FilePrefetchBuffer { FilePrefetchBufferUsage usage_; - // upper_bound_offset_ is set when ReadOptions.iterate_upper_bound and - // ReadOptions.auto_readahead_size are set to trim readahead_size upto - // upper_bound_offset_ during prefetching. - uint64_t upper_bound_offset_ = 0; std::function<void(bool, uint64_t&, uint64_t&)> readaheadsize_cb_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 2c0e8817a44..b97564129a5 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -1355,10 +1355,6 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { cmp_iter->Next(); } - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_OK(cmp_iter->status()); ASSERT_OK(iter->status()); } @@ -1385,10 +1381,6 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { cmp_iter->Next(); } - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_OK(cmp_iter->status()); ASSERT_OK(iter->status()); } @@ -2357,314 +2349,6 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { Close(); } -// This test checks if readahead_size is trimmed when upper_bound is reached. -// It tests with different combinations of async_io disabled/enabled, -// readahead_size (implicit and explicit), and num_file_reads_for_auto_readahead -// from 0 to 2.
-TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBound) { - if (mem_env_ || encrypted_env_) { - ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); - return; - } - - // First param is if the mockFS support_prefetch or not - std::shared_ptr<MockFS> fs = - std::make_shared<MockFS>(FileSystem::Default(), false); - - std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs)); - Options options; - SetGenericOptions(env.get(), /*use_direct_io=*/false, options); - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions table_options; - SetBlockBasedTableOptions(table_options); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - Status s = TryReopen(options); - ASSERT_OK(s); - - Random rnd(309); - WriteBatch batch; - - for (int i = 0; i < 26; i++) { - std::string key = "my_key_"; - - for (int j = 0; j < 10; j++) { - key += char('a' + i); - ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); - } - } - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - - std::string start_key = "my_key_a"; - - std::string end_key = "my_key_"; - for (int j = 0; j < 10; j++) { - end_key += char('a' + 25); - } - - Slice least(start_key.data(), start_key.size()); - Slice greatest(end_key.data(), end_key.size()); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); - - int buff_prefetch_count = 0; - - // Try with different num_file_reads_for_auto_readahead from 0 to 3. - for (size_t i = 0; i < 3; i++) { - table_options.num_file_reads_for_auto_readahead = i; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - s = TryReopen(options); - ASSERT_OK(s); - - int buff_count_with_tuning = 0, buff_count_without_tuning = 0; - int keys_with_tuning = 0, keys_without_tuning = 0; - int reseek_keys_with_tuning = 0, reseek_keys_without_tuning = 0; - buff_prefetch_count = 0; - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::Prefetch:Start", - [&](void*) { buff_prefetch_count++; }); - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); - - SyncPoint::GetInstance()->EnableProcessing(); - - ReadOptions ropts; - if (std::get<0>(GetParam())) { - ropts.readahead_size = 32768; - } - if (std::get<1>(GetParam())) { - ropts.async_io = true; - } - - // With tuning readahead_size. - { - ASSERT_OK(options.statistics->Reset()); - Slice ub = Slice("my_key_uuu"); - Slice* ub_ptr = &ub; - ropts.iterate_upper_bound = ub_ptr; - ropts.auto_readahead_size = true; - - auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts)); - - // Seek. - { - Slice seek_key = Slice("my_key_aaa"); - iter->Seek(seek_key); - - while (iter->Valid()) { - keys_with_tuning++; - iter->Next(); - } - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - buff_count_with_tuning = buff_prefetch_count; - } - - // Reseek with new upper_bound_iterator.
- { - ub = Slice("my_key_y"); - Slice reseek_key = Slice("my_key_v"); - iter->Seek(reseek_key); - - while (iter->Valid()) { - iter->Next(); - reseek_keys_with_tuning++; - } - ASSERT_OK(iter->status()); - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_GT(readahead_trimmed, 0); - ASSERT_GT(reseek_keys_with_tuning, 0); - } - } - - // Without tuning readahead_size - { - Slice ub = Slice("my_key_uuu"); - Slice* ub_ptr = &ub; - ropts.iterate_upper_bound = ub_ptr; - buff_prefetch_count = 0; - ASSERT_OK(options.statistics->Reset()); - ropts.auto_readahead_size = false; - - auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts)); - - // Seek. - { - Slice seek_key = Slice("my_key_aaa"); - iter->Seek(seek_key); - - while (iter->Valid()) { - keys_without_tuning++; - iter->Next(); - } - buff_count_without_tuning = buff_prefetch_count; - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_EQ(readahead_trimmed, 0); - } - - // Reseek with new upper_bound_iterator. - { - ub = Slice("my_key_y"); - Slice reseek_key = Slice("my_key_v"); - iter->Seek(reseek_key); - - while (iter->Valid()) { - iter->Next(); - reseek_keys_without_tuning++; - } - ASSERT_OK(iter->status()); - - uint64_t readahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - ASSERT_EQ(readahead_trimmed, 0); - ASSERT_GT(reseek_keys_without_tuning, 0); - } - } - - { - // Verify results with and without tuning. - if (std::get<1>(GetParam())) { - // In case of async_io. - ASSERT_GE(buff_count_with_tuning, buff_count_without_tuning); - } else { - ASSERT_EQ(buff_count_without_tuning, buff_count_with_tuning); - } - // Prefetching should happen. - ASSERT_GT(buff_count_without_tuning, 0); - ASSERT_GT(buff_count_with_tuning, 0); - // No of keys should be equal. - ASSERT_EQ(keys_without_tuning, keys_with_tuning); - // No of keys after reseek with new upper bound should be equal. - ASSERT_EQ(reseek_keys_without_tuning, reseek_keys_with_tuning); - } - Close(); - } -} - -// This test checks if readahead_size is trimmed when upper_bound is reached -// during Seek in async_io and it goes for polling without any extra -// prefetching.
-TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBoundSeekOnly) { - if (mem_env_ || encrypted_env_) { - ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); - return; - } - - // First param is if the mockFS support_prefetch or not - std::shared_ptr<MockFS> fs = - std::make_shared<MockFS>(FileSystem::Default(), false); - - bool use_direct_io = false; - if (std::get<0>(GetParam())) { - use_direct_io = true; - } - - std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs)); - Options options; - SetGenericOptions(env.get(), use_direct_io, options); - options.statistics = CreateDBStatistics(); - BlockBasedTableOptions table_options; - SetBlockBasedTableOptions(table_options); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - Status s = TryReopen(options); - if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { - // If direct IO is not supported, skip the test - return; - } else { - ASSERT_OK(s); - } - - Random rnd(309); - WriteBatch batch; - - for (int i = 0; i < 26; i++) { - std::string key = "my_key_"; - - for (int j = 0; j < 10; j++) { - key += char('a' + i); - ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); - } - } - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - - std::string start_key = "my_key_a"; - - std::string end_key = "my_key_"; - for (int j = 0; j < 10; j++) { - end_key += char('a' + 25); - } - - Slice least(start_key.data(), start_key.size()); - Slice greatest(end_key.data(), end_key.size()); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); - - s = TryReopen(options); - ASSERT_OK(s); - - int buff_count_with_tuning = 0; - - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_count_with_tuning++; }); - - bool read_async_called = false; - SyncPoint::GetInstance()->SetCallBack( - "UpdateResults::io_uring_result", - [&](void* /*arg*/) { read_async_called = true; }); - - SyncPoint::GetInstance()->EnableProcessing(); - - SyncPoint::GetInstance()->EnableProcessing(); - - ReadOptions ropts; - if (std::get<1>(GetParam())) { - ropts.readahead_size = 32768; - } - ropts.async_io = true; - - Slice ub = Slice("my_key_aaa"); - ropts.iterate_upper_bound = &ub; - Slice seek_key = Slice("my_key_aaa"); - - // With tuning readahead_size. - { - ASSERT_OK(options.statistics->Reset()); - ropts.auto_readahead_size = true; - - auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts)); - - iter->Seek(seek_key); - - ASSERT_OK(iter->status()); - - // Verify results. - uint64_t readhahead_trimmed = - options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); - // Readahead got trimmed. - if (read_async_called) { - ASSERT_GT(readhahead_trimmed, 0); - // Seek called PrefetchAsync to poll the data. - ASSERT_EQ(1, buff_count_with_tuning); - } else { - // async_io disabled. - ASSERT_GE(readhahead_trimmed, 0); - ASSERT_EQ(0, buff_count_with_tuning); - } - } - Close(); -} - namespace { #ifdef GFLAGS const int kMaxArgCount = 100; @@ -3243,7 +2927,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) { std::unique_ptr<RandomAccessFileReader> r; Read(fname, opts, &r); - FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, 0, fs()); + FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs()); Slice result; // Simulate a seek of 4096 bytes at offset 0.
Due to the readahead settings, // it will do two reads of 4096+8192 and 8192 @@ -3292,8 +2976,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { FilePrefetchBuffer fpb( /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, /*track_min_offset=*/false, /*implicit_auto_readahead=*/true, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, - /*upper_bound_offset=*/0, fs()); + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, fs()); Slice result; // Simulate a seek of half of alignment bytes at offset n. Due to the @@ -3324,8 +3007,7 @@ TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { FilePrefetchBuffer fpb( /*readahead_size=*/16384, /*max_readahead_size=*/16384, /*enable=*/true, /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, - /*upper_bound_offset=*/0, fs()); + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, fs()); Slice result; // Simulate a seek of half of alignment bytes at offset n. @@ -3361,8 +3043,7 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { FilePrefetchBuffer fpb( /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/0, fs()); + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, fs()); int read_async_called = 0; SyncPoint::GetInstance()->SetCallBack( @@ -3400,63 +3081,6 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { ASSERT_EQ(result, async_result); } -// This test checks if during seek in async_io, if first buffer already -// prefetched the data till upper_bound offset, second buffer shouldn't go for -// prefetching. -TEST_F(FilePrefetchBufferTest, IterateUpperBoundTest1) { - std::string fname = "iterate-upperbound-test1"; - Random rand(0); - std::string content = rand.RandomString(32768); - Write(fname, content); - - FileOptions opts; - std::unique_ptr<RandomAccessFileReader> r; - Read(fname, opts, &r); - - FilePrefetchBuffer fpb( - /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, - /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/8000, fs()); - - int read_async_called = 0; - SyncPoint::GetInstance()->SetCallBack( - "FilePrefetchBuffer::ReadAsync", - [&](void* /*arg*/) { read_async_called++; }); - SyncPoint::GetInstance()->EnableProcessing(); - - Slice async_result; - // Simulate a seek of 4000 bytes at offset 3000. Due to the readahead - // settings, it will do 1 read of 4000+1000 (till 8000 - upper bound). - Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result); - - // Platforms that don't have IO uring may not support async IO - if (s.IsNotSupported()) { - return; - } - - ASSERT_TRUE(s.IsTryAgain()); - IOOptions io_opts; - io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; - ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000, - /*length=*/4000, &async_result, &s)); - // No sync call should be made. - HistogramData sst_read_micros; - stats()->histogramData(SST_READ_MICROS, &sst_read_micros); - ASSERT_EQ(sst_read_micros.count, 0); - - // Number of async calls should be 1. - // No Prefetching should happen in second buffer as first buffer has already - // prefetched till offset. - ASSERT_EQ(read_async_called, 1); - // Length should be 4000.
- ASSERT_EQ(async_result.size(), 4000); - // Data correctness. - Slice result(&content[3000], 4000); - ASSERT_EQ(result.size(), 4000); - ASSERT_EQ(result, async_result); -} - TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { std::string fname = "seek-with-block-cache-hit"; Random rand(0); @@ -3468,7 +3092,7 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { Read(fname, opts, &r); std::shared_ptr<Statistics> stats = CreateDBStatistics(); - FilePrefetchBuffer fpb(8192, 8192, true, false, false, 0, 0, 0, fs(), nullptr, + FilePrefetchBuffer fpb(8192, 8192, true, false, false, 0, 0, fs(), nullptr, stats.get()); Slice result; // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings, diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 415f5a70d02..ae5ed2c2656 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1737,8 +1737,6 @@ struct ReadOptions { // Default: empty (every table will be scanned) std::function<bool(const TableProperties&)> table_filter; - // Experimental - // // If auto_readahead_size is set to true, it will auto tune the readahead_size // during scans internally. // For this feature to enabled, iterate_upper_bound must also be specified. // // NOTE: - Recommended for forward Scans only. // - If there is a backward scans, this option will be // disabled internally and won't be enabled again if the forward scan // is issued again. // - // Default: false - bool auto_readahead_size = false; + // Default: true + bool auto_readahead_size = true; // *** END options only relevant to iterators or scans *** diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc index 93fdd1d6150..d08def5a04f 100644 --- a/table/block_based/block_based_table_iterator.cc +++ b/table/block_based/block_based_table_iterator.cc @@ -114,21 +114,6 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, } } - if (autotune_readaheadsize) { - FindReadAheadSizeUpperBound(); - if (target) { - index_iter_->Seek(*target); - } else { - index_iter_->SeekToFirst(); - } - - // Check for IO error. - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - } - // After reseek, index_iter_ point to the right key i.e. target in // case of readahead_cache_lookup_. So index_iter_ can be used directly. IndexValue v = index_iter_->value(); @@ -691,40 +676,6 @@ void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { } } -void BlockBasedTableIterator::FindReadAheadSizeUpperBound() { - size_t total_bytes_till_upper_bound = 0; - size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); - uint64_t start_offset = index_iter_->value().handle.offset(); - - do { - BlockHandle block_handle = index_iter_->value().handle; - total_bytes_till_upper_bound += block_handle.size(); - total_bytes_till_upper_bound += footer; - - // Can't figure out for current block if current block - // is out of bound. But for next block we can find that. - // If curr block's index key >= iterate_upper_bound, it - // means all the keys in next block or above are out of - // bound. - if (IsNextBlockOutOfBound()) { - break; - } - - // Since next block is not out of bound, iterate to that - // index block and add it's Data block size to - // readahead_size.
- index_iter_->Next(); - - if (!index_iter_->Valid()) { - break; - } - - } while (true); - - block_prefetcher_.SetUpperBoundOffset(start_offset + - total_bytes_till_upper_bound); -} - void BlockBasedTableIterator::InitializeStartAndEndOffsets( bool read_curr_block, bool& found_first_miss_block, uint64_t& start_updated_offset, uint64_t& end_updated_offset, diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index 84c83014c9c..55478530508 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -371,7 +371,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> { } // *** BEGIN APIs relevant to auto tuning of readahead_size *** - void FindReadAheadSizeUpperBound(); // This API is called to lookup the data blocks ahead in the cache to tune // the start and end offsets passed. diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 3856224619e..a184264df17 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -887,7 +887,7 @@ Status BlockBasedTable::PrefetchTail( 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, true /* track_min_offset */, false /* implicit_auto_readahead */, 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, - 0 /* upper_bound_offset */, nullptr /* fs */, nullptr /* clock */, stats, + nullptr /* fs */, nullptr /* clock */, stats, /* readahead_cb */ nullptr, FilePrefetchBufferUsage::kTableOpenPrefetchTail)); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 43ea1602d06..a7a94cd0b61 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -699,30 +699,29 @@ struct BlockBasedTable::Rep { size_t readahead_size, size_t max_readahead_size, std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead, uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead, - uint64_t upper_bound_offset, + const std::function<void(bool, uint64_t&, uint64_t&)>& readaheadsize_cb, FilePrefetchBufferUsage usage) const { fpb->reset(new FilePrefetchBuffer( readahead_size, max_readahead_size, !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */, implicit_auto_readahead, num_file_reads, - num_file_reads_for_auto_readahead, upper_bound_offset, - ioptions.fs.get(), ioptions.clock, ioptions.stats, readaheadsize_cb, - usage)); + num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock, + ioptions.stats, readaheadsize_cb, usage)); } void CreateFilePrefetchBufferIfNotExists( size_t readahead_size, size_t max_readahead_size, std::unique_ptr<FilePrefetchBuffer>* fpb, bool implicit_auto_readahead, uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead, - uint64_t upper_bound_offset, + const std::function<void(bool, uint64_t&, uint64_t&)>& readaheadsize_cb, FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) const { if (!(*fpb)) { CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, implicit_auto_readahead, num_file_reads, num_file_reads_for_auto_readahead, - upper_bound_offset, readaheadsize_cb, usage); + readaheadsize_cb, usage); } } diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index 4e750d7999e..b974f919040 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -48,7 +48,7 @@ void BlockPrefetcher::PrefetchIfNeeded( compaction_readahead_size_,
compaction_readahead_size_, &prefetch_buffer_, /*implicit_auto_readahead=*/false, /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, - /*upper_bound_offset=*/0, /*readaheadsize_cb=*/nullptr); + /*readaheadsize_cb=*/nullptr); return; } @@ -57,8 +57,7 @@ void BlockPrefetcher::PrefetchIfNeeded( rep->CreateFilePrefetchBufferIfNotExists( readahead_size, readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/false, /*num_file_reads=*/0, - /*num_file_reads_for_auto_readahead=*/0, upper_bound_offset_, - readaheadsize_cb, + /*num_file_reads_for_auto_readahead=*/0, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -83,8 +82,7 @@ void BlockPrefetcher::PrefetchIfNeeded( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, /*num_file_reads=*/0, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + rep->table_options.num_file_reads_for_auto_readahead, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -115,8 +113,7 @@ void BlockPrefetcher::PrefetchIfNeeded( rep->CreateFilePrefetchBufferIfNotExists( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + rep->table_options.num_file_reads_for_auto_readahead, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -138,8 +135,7 @@ void BlockPrefetcher::PrefetchIfNeeded( rep->CreateFilePrefetchBufferIfNotExists( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead, - upper_bound_offset_, readaheadsize_cb, + rep->table_options.num_file_reads_for_auto_readahead, readaheadsize_cb, /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index af0a63018e8..e46aaf6144a 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -53,15 +53,6 @@ class BlockPrefetcher { &initial_auto_readahead_size_); } - void SetUpperBoundOffset(uint64_t upper_bound_offset) { - upper_bound_offset_ = upper_bound_offset; - if (prefetch_buffer() != nullptr) { - // Upper bound can be changed on reseek. So update that in - // FilePrefetchBuffer. - prefetch_buffer()->ResetUpperBoundOffset(upper_bound_offset); - } - } - private: // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. 
@@ -78,7 +69,5 @@ uint64_t prev_offset_ = 0; size_t prev_len_ = 0; std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_; - - uint64_t upper_bound_offset_ = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index c908db41d35..47ac98b9cd3 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -498,7 +498,7 @@ Status PartitionedFilterBlockReader::CacheDependencies( rep->CreateFilePrefetchBuffer( 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, - /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, + /*readaheadsize_cb*/ nullptr, /*usage=*/FilePrefetchBufferUsage::kUnknown); IOOptions opts; diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index f825907180a..b4a16dd2269 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -170,7 +170,7 @@ Status PartitionIndexReader::CacheDependencies( rep->CreateFilePrefetchBuffer( 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, - /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, + /*readaheadsize_cb*/ nullptr, /*usage=*/FilePrefetchBufferUsage::kUnknown); IOOptions opts; { diff --git a/table/table_test.cc b/table/table_test.cc index 2904792c249..298e25fbd5b 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -3189,6 +3189,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) { Options options; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.create_if_missing = true; + options.statistics = CreateDBStatistics(); table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_options.block_cache = NewLRUCache(1024 * 1024, 0); @@ -3232,6 +3233,8 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) { "00000255"}; WarmUpCache(&c, moptions, warm_keys); + ASSERT_OK(options.statistics->Reset()); + std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator( read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kUncategorized)); @@ -3256,6 +3259,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) { // One block data.
ASSERT_EQ(buffer_len, 4096); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } { @@ -3309,6 +3315,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) { bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle); ASSERT_EQ(buffer_offset, 106496); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } } c.ResetTableReader(); @@ -3320,6 +3329,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { std::unique_ptr<Env> env( new CompositeEnvWrapper(c.env_, FileSystem::Default())); options.env = env.get(); + options.statistics = CreateDBStatistics(); c.env_ = env.get(); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); @@ -3369,6 +3379,8 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { "00000255"}; WarmUpCache(&c, moptions, warm_keys); + ASSERT_OK(options.statistics->Reset()); + std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator( read_options, moptions.prefix_extractor.get(), nullptr, false, TableReaderCaller::kUncategorized)); @@ -3396,6 +3408,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { prefetch_buffer->TEST_GetBufferOffsetandSize(1, buffer_offset, buffer_len); ASSERT_EQ(buffer_len, 0); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 2); } { // Check the behavior when it's - @@ -3442,6 +3457,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { block_handle); ASSERT_EQ(buffer_len, 8192); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } } @@ -3492,6 +3510,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { block_handle); ASSERT_EQ(buffer_len, 8192); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } // Third prefetch ReadAsync (buffers will swap). @@ -3525,6 +3546,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { block_handle); ASSERT_EQ(buffer_len, 4096); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } // 4th Prefetch ReadAsync (buffers will swap). @@ -3558,6 +3582,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { block_handle); ASSERT_EQ(buffer_len, 4096); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 1); } // 5th Prefetch ReadAsync. @@ -3591,6 +3618,9 @@ TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) { block_handle); ASSERT_EQ(buffer_len, 8192); ASSERT_EQ(buffer_offset, block_handle.offset()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED), + 0); } } } diff --git a/unreleased_history/behavior_changes/auto_readahead_size.md b/unreleased_history/behavior_changes/auto_readahead_size.md new file mode 100644 index 00000000000..b1c98dd86f9 --- /dev/null +++ b/unreleased_history/behavior_changes/auto_readahead_size.md @@ -0,0 +1 @@ +Make ReadOptions.auto_readahead_size default true, which enables prefetching optimizations for forward scans when iterate_upper_bound and block_cache are also specified.
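As context for the behavior change above, here is a minimal sketch of what the new default means for application code. It uses only the public RocksDB API (DB::Open, ReadOptions, NewIterator); the database path and key names are hypothetical, and the exact prefetch sizes depend on table options:

#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  // Hypothetical path, for illustration only.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/auto_readahead_size_demo", &db);
  assert(s.ok());

  // With this patch, ReadOptions::auto_readahead_size defaults to true, so a
  // forward scan with iterate_upper_bound set gets its readahead tuned and
  // trimmed at the upper bound (counted by the READAHEAD_TRIMMED ticker).
  rocksdb::ReadOptions ropts;
  rocksdb::Slice upper_bound("my_key_u");  // must outlive the iterator
  ropts.iterate_upper_bound = &upper_bound;

  std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(ropts));
  for (iter->Seek("my_key_a"); iter->Valid(); iter->Next()) {
    // Forward scan; prefetching no longer runs past the upper bound.
  }
  assert(iter->status().ok());
  iter.reset();

  // Callers who relied on the old default can opt out explicitly:
  ropts.auto_readahead_size = false;

  delete db;
  return 0;
}

The option only changes how much data is prefetched, so opting out is purely a performance choice; scan results are identical either way.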