Get
SuperVersion* sv = GetAndRefSuperVersion;
SequenceNumber snapshot;
//获取snapshot (目前最大的sequence)
...
bool done = false;
if (!skip_memtable) {
// Get value associated with key
if (get_impl_options.get_value) {
//查询memtable
if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
&merge_context, &max_covering_tombstone_seq,
read_options, get_impl_options.callback,
get_impl_options.is_blob_index)) {
done = true;
get_impl_options.value->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
} else if ((s.ok() || s.IsMergeInProgress()) &&
sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
&merge_context, &max_covering_tombstone_seq,
read_options, get_impl_options.callback,
get_impl_options.is_blob_index)) {
done = true;
get_impl_options.value->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
}
}
...
memtable get
存在memtable里的key是key+(type and sequence)其中type and seq混合8字节
//先查询bloom filter
if (bloom_filter_) {
// when both memtable_whole_key_filtering and prefix_extractor_ are set,
// only do whole key filtering for Get() to save CPU
if (moptions_.memtable_whole_key_filtering) {
may_contain =
bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz));
} else {
assert(prefix_extractor_);
may_contain =
!prefix_extractor_->InDomain(user_key) ||
bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
}
}
//从memtable里拿
GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
is_blob_index, value, s, merge_context, seq,
&found_final_value, &merge_in_progress);
void MemTable::GetFromTable(...){
//构建saver 和回调
Saver saver;
saver.status = s;
saver.found_final_value = found_final_value;
saver.merge_in_progress = merge_in_progress;
saver.key = &key;
saver.value = value;
saver.seq = kMaxSequenceNumber;
saver.mem = this;
saver.merge_context = merge_context;
saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
saver.merge_operator = moptions_.merge_operator;
saver.logger = moptions_.info_log;
saver.inplace_update_support = moptions_.inplace_update_support;
saver.statistics = moptions_.statistics;
saver.env_ = env_;
saver.callback_ = callback;
saver.is_blob_index = is_blob_index;
saver.do_merge = do_merge;
//执行查找
table_->Get(key, &saver, SaveValue);
}
void MemTableRep::Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) {
auto iter = GetDynamicPrefixIterator();
//从skiplist里查找
for (iter->Seek(k.internal_key(), k.memtable_key().data());
iter->Valid() && callback_func(callback_args, iter->key());
iter->Next()) {
}
}
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
//寻找key和sequence符合条件的 node
//返回小于我们seq的值
//Returns the earliest node with a key >= key.
// Return nullptr if there is no such node.
//key在skip list里从大到小排
//所以查找会找到key >= 我们需要的key
//如果key相等,会按照seq降序排,所以顺序过去一定是seq小于等与我们需要的seq
node_ = list_->FindGreaterOrEqual(target);
}
static bool SaveValue(void* arg, const char* entry) {
Saver* s = reinterpret_cast<Saver*>(arg);
assert(s != nullptr);
MergeContext* merge_context = s->merge_context;
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
const MergeOperator* merge_operator = s->merge_operator;
assert(merge_context != nullptr);
// entry format is:
// klength varint32
// userkey char[klength-8]
// tag uint64
// vlength varint32f
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
Slice user_key_slice = Slice(key_ptr, key_length - 8);
//这里因为seek可能找到key大于我们需要的key,此时需要比较一下,如果不想等则直接跳过
if (s->mem->GetInternalKeyComparator()
.user_comparator()
->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
ValueType type;
SequenceNumber seq;
UnPackSequenceAndType(tag, &seq, &type);
// If the value is not in the snapshot, skip it
if (!s->CheckCallback(seq)) {
return true; // to continue to the next seq
}
s->seq = seq;
if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
max_covering_tombstone_seq > seq) {
type = kTypeRangeDeletion;
}
switch (type) {
...
//根据type处理key value
}
// s->state could be Corrupt, merge or notfound
return false;
}
ThreadLocalSuperVersion
Rocksdb利用线程局部缓存和atomic来替换掉原先leveldb的version加锁的逻辑
//在读之前需要获得新的superversion(最新的versionset)
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
//通过swap获得当前的superversion(每个线程都用InUse对象替换tls对象)
//如果没有写,那么在执行ReturnThreadLocalSuperVersion前,tls都保持inuse对象
void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
// Invariant:
// (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
// (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
// should only keep kSVInUse before ReturnThreadLocalSuperVersion call
// (if no Scrape happens).
assert(ptr != SuperVersion::kSVInUse);
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
//如果刚获取完superversion,就发现已经过期了。那就把这个给删了,直接通过加锁获取当前最新的super version
if (sv == SuperVersion::kSVObsolete ||
sv->version_number != super_version_number_.load()) {
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
db->mutex()->Lock();
// NOTE: underlying resources held by superversion (sst files) might
// not be released until the next background job.
sv->Cleanup();
if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
db->AddSuperVersionsToFreeQueue(sv);
db->SchedulePurge();
} else {
sv_to_delete = sv;
}
} else {
db->mutex()->Lock();
}
//这里一定要加锁,防止在被后台线程作出变更,并获取当前的全局super_version
sv = super_version_->Ref();
db->mutex()->Unlock();
delete sv_to_delete;
}
assert(sv != nullptr);
return sv;
}
ReturnAndCleanupSuperVersion
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
SuperVersion* sv) {
if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
//将当前的superversion反还给tls,如果此时cas发现换不回去,则说明已经被变更了(写线程修改了所有线程的tls为nullptr)
//清除掉当前保留的旧版本superversion
CleanupSuperVersion(sv);
}
}
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
assert(sv != nullptr);
// Put the SuperVersion back
void* expected = SuperVersion::kSVInUse;
if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
// When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
// storage has not been altered and no Scrape has happened. The
// SuperVersion is still current.
return true;
} else {
// ThreadLocal scrape happened in the process of this GetImpl call (after
// thread local Swap() at the beginning and before CompareAndSwap()).
// This means the SuperVersion it holds is obsolete.
assert(expected == SuperVersion::kSVObsolete);
}
return false;
}
InstallSuperVersion
InstallSuperVersionAndScheduleWork->
void ColumnFamilyData::InstallSuperVersion(
SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
const MutableCFOptions& mutable_cf_options) {
//外部加锁了
SuperVersion* new_superversion = sv_context->new_superversion.release();
new_superversion->db_mutex = db_mutex;
new_superversion->mutable_cf_options = mutable_cf_options;
new_superversion->Init(mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_;
//设置新的suerversion
super_version_ = new_superversion;
++super_version_number_;
super_version_->version_number = super_version_number_;
super_version_->write_stall_condition =
RecalculateWriteStallConditions(mutable_cf_options);
if (old_superversion != nullptr) {
// Reset SuperVersions cached in thread local storage.
// This should be done before old_superversion->Unref(). That's to ensure
// that local_sv_ never holds the last reference to SuperVersion, since
// it has no means to safely do SuperVersion cleanup.
//将其他线程的tls设置为nullptr(SuperVersion::kSVObsolete)
//这个在old_superversion->Unref()之前调用,这样local_sv就不会是最后一个superversion的引用
ResetThreadLocalSuperVersions();
if (old_superversion->mutable_cf_options.write_buffer_size !=
mutable_cf_options.write_buffer_size) {
mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
}
if (old_superversion->write_stall_condition !=
new_superversion->write_stall_condition) {
sv_context->PushWriteStallNotification(
old_superversion->write_stall_condition,
new_superversion->write_stall_condition, GetName(), ioptions());
}
//如果这是最后一个对old_superversion的引用,那么就将其清除掉
if (old_superversion->Unref()) {
old_superversion->Cleanup();
sv_context->superversions_to_free.push_back(old_superversion);
}
}
}
rocksdb对leveldb的读优化
Mutex用时也是Atomic的3倍。
rocksdb就是将leveldb里Get()实现中一上来就mutex加锁的操作换成atmoic+线程私有存储的方式来进行优化,优化后读操作基本很少再会有互斥,性能提高不少