虽然网上关于leveldb源码分析的文章挺多,但还是想写下来记录自己的再次学习过程。早几年前看过一些leveldb的实现,但因为当时没有写博客记录下来,所以这次再分析leveldb源码时,把内容整理成文,以后回看自己记录的东西会方便些。后面打算再写几篇,之后就不再更新了。
本来是准备从头开始分析的,但一般对于数据库,读和写是常用的接口,从读写开始分析整个流程,包括启动时做的事情,宕机时数据库的恢复,读和写放大,以及可能的一致性等,对于性能优化这块,可能要参考链接中的资料,leveldb本身是个单机数据库,不包含网络这块,以及主备一致性等其他额外的功能。
通过leveldb,一方面是学习设计实现,比如根据list+hash实现lru算法,skiplist使用,多线程读写,布隆过滤器,以及关键的lsm模型等,当然也可以看到其中的bug及性能方面的问题,这些只有真正遇到踩坑时才会知道。
首先在开始前,先说明一下用到的key,分为InternalKey
和LookupKey
,部分代码如下:
134 class InternalKey {
135 private:
136 std::string rep_;
137 public:
138 InternalKey() { } // Leave rep_ as empty to indicate it is invalid
139 InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
140 AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
141 }
12 static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
13 assert(seq <= kMaxSequenceNumber);
14 assert(t <= kValueTypeForSeek);
15 return (seq << 8) | t;
16 }
17
18 void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
19 result->append(key.user_key.data(), key.user_key.size());
20 PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
21 }
70 struct ParsedInternalKey {//分解出user_key+seq+type
71 Slice user_key;
72 SequenceNumber sequence;
73 ValueType type;
76 ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
77 : user_key(u), sequence(seq), type(t) { }
79 };
178 // A helper class useful for DBImpl::Get()
179 class LookupKey {
180 public:
181 // Initialize *this for looking up user_key at a snapshot with
182 // the specified sequence number.
183 LookupKey(const Slice& user_key, SequenceNumber sequence);
184
185 ~LookupKey();
186
187 // Return a key suitable for lookup in a MemTable.
188 Slice memtable_key() const { return Slice(start_, end_ - start_); }
189
190 // Return an internal key (suitable for passing to an internal iterator)
191 Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
192
193 // Return the user key
194 Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
195
196 private:
204 const char* start_;
205 const char* kstart_;
206 const char* end_;
207 char space_[200]; // Avoid allocation for short keys
208
209 // No copying allowed
210 LookupKey(const LookupKey&);
211 void operator=(const LookupKey&);
212 };
121 LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
122 size_t usize = user_key.size();
123 size_t needed = usize + 13; // A conservative estimate
124 char* dst;
125 if (needed <= sizeof(space_)) {
126 dst = space_;
127 } else {
128 dst = new char[needed];
129 }
130 start_ = dst;
131 dst = EncodeVarint32(dst, usize + 8);
132 kstart_ = dst;
133 memcpy(dst, user_key.data(), usize);
134 dst += usize;
135 EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
136 dst += 8;
137 end_ = dst;
138 }
InternalKey
由user_key+7字节的seq_number+1字节的type
组成;这样做是有一定的好处的,会在后面说明。
以上的实现也比较简单,其中type
只有两种类型,kTypeDeletion
和kTypeValue
,LookupKey
构造时,会编码成 key 的大小 + key 的内容 + seq/type,与 write 时写入 memtable 的条目中 key 部分的编码格式一致:
125 virtual void Put(const Slice& key, const Slice& value) {
126 mem_->Add(sequence_, kTypeValue, key, value);
127 sequence_++;
128 }
82 void MemTable::Add(SequenceNumber s, ValueType type,
83 const Slice& key,
84 const Slice& value) {
85 // Format of an entry is concatenation of:
86 // key_size : varint32 of internal_key.size()
87 // key bytes : char[internal_key.size()]
88 // value_size : varint32 of value.size()
89 // value bytes : char[value.size()]
90 size_t key_size = key.size();
91 size_t val_size = value.size();
92 size_t internal_key_size = key_size + 8;//InternalKey的可变长度(key+seq+type)
93 const size_t encoded_len =
94 VarintLength(internal_key_size) + internal_key_size +
95 VarintLength(val_size) + val_size;//整条内容长度
96 char* buf = arena_.Allocate(encoded_len);
97 char* p = EncodeVarint32(buf, internal_key_size);//InternalKey的大小
98 memcpy(p, key.data(), key_size);//key内容
99 p += key_size;
100 EncodeFixed64(p, (s << 8) | type);//seq_number +type内容
101 p += 8;
102 p = EncodeVarint32(p, val_size);//value大小
103 memcpy(p, value.data(), val_size);//value内容
104 assert(p + val_size == buf + encoded_len);
105 table_.Insert(buf);
106 }
以上是把某条记录加到skiplist中,这里不管是删除还是修改增加,统一为插入新的记录,后期会进行合并保留最新的。
下面是在写入时的合并类,部分代码:
1487 Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
1488 WriteBatch batch;
1489 batch.Put(key, value);
1490 return Write(opt, &batch);
1491 }
1492
1493 Status DB::Delete(const WriteOptions& opt, const Slice& key) {
1494 WriteBatch batch;
1495 batch.Delete(key);
1496 return Write(opt, &batch);
1497 }
26 // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
27 static const size_t kHeader = 12;
32 class LEVELDB_EXPORT WriteBatch {
33 public:
34 WriteBatch();
73 private:
74 friend class WriteBatchInternal;
75
76 std::string rep_; // See comment in write_batch.cc for the format of rep_
77 };
102 void WriteBatch::Put(const Slice& key, const Slice& value) {
103 WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
104 rep_.push_back(static_cast<char>(kTypeValue));
105 PutLengthPrefixedSlice(&rep_, key);
106 PutLengthPrefixedSlice(&rep_, value);
107 }
108
109 void WriteBatch::Delete(const Slice& key) {//删除也是追加记录
110 WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
111 rep_.push_back(static_cast<char>(kTypeDeletion));
112 PutLengthPrefixedSlice(&rep_, key);
113 }
46 Status WriteBatch::Iterate(Handler* handler) const {
47 Slice input(rep_);
48 if (input.size() < kHeader) {
49 return Status::Corruption("malformed WriteBatch (too small)");
50 }
51
52 input.remove_prefix(kHeader);
53 Slice key, value;
54 int found = 0;
55 while (!input.empty()) {
56 found++;
57 char tag = input[0];
58 input.remove_prefix(1);
59 switch (tag) {
60 case kTypeValue://修改
61 if (GetLengthPrefixedSlice(&input, &key) &&
62 GetLengthPrefixedSlice(&input, &value)) {
63 handler->Put(key, value);
64 } else {
65 return Status::Corruption("bad WriteBatch Put");
66 }
67 break;
68 case kTypeDeletion://删除
69 if (GetLengthPrefixedSlice(&input, &key)) {
70 handler->Delete(key);
71 } else {
72 return Status::Corruption("bad WriteBatch Delete");
73 }
74 break;
75 default:
76 return Status::Corruption("unknown WriteBatch tag");
77 }
78 }
119 namespace {
120 class MemTableInserter : public WriteBatch::Handler {
121 public:
122 SequenceNumber sequence_;
123 MemTable* mem_;
124
125 virtual void Put(const Slice& key, const Slice& value) {
126 mem_->Add(sequence_, kTypeValue, key, value);
127 sequence_++;
128 }
129 virtual void Delete(const Slice& key) {
130 mem_->Add(sequence_, kTypeDeletion, key, Slice());
131 sequence_++;
132 }
133 };
134 } // namespace
136 Status WriteBatchInternal::InsertInto(const WriteBatch* b,
137 MemTable* memtable) {
138 MemTableInserter inserter;
139 inserter.sequence_ = WriteBatchInternal::Sequence(b);
140 inserter.mem_ = memtable;
141 return b->Iterate(&inserter);
142 }
根据以上实现,可知WriteBatch
中rep_
的格式:seq_number + count + record(type + keysize + key + valuesize + value) + .... + record;
如果是删除的话则不会有value相关的字段。
下面是写的接口,为了简单化起见,不考虑中间的合并等情况,后面会慢慢分析,单纯的写数据:
1204 Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
1205 Writer w(&mutex_);
1206 w.batch = my_batch;
1207 w.sync = options.sync;
1208 w.done = false;
1209
1210 MutexLock l(&mutex_);
1211 writers_.push_back(&w);
1212 while (!w.done && &w != writers_.front()) {
1213 w.cv.Wait();
1214 }
1215 if (w.done) {
1216 return w.status;
1217 }
1218 //more code...
1258 while (true) {
1259 Writer* ready = writers_.front();
1260 writers_.pop_front();
1261 if (ready != &w) {
1262 ready->status = status;
1263 ready->done = true;
1264 ready->cv.Signal();
1265 }
1266 if (ready == last_writer) break;
1267 }
1268
1269 // Notify new head of write queue
1270 if (!writers_.empty()) {
1271 writers_.front()->cv.Signal();
1272 }
1273
1274 return status;
1275 }
以上是在多线程写的时候,会进入到队列中等待,由其他线程帮忙处理写请求,满足条件(w.done == false and writers_.front() == w
)的写线程进行合并写请求;当处理完后会一个个设置Writer的w.done为真并唤醒,之后被唤醒的写线程由于done为真就返回,剩下的再继续写;
1204 Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
1219 // May temporarily unlock and wait.
1220 Status status = MakeRoomForWrite(my_batch == nullptr);
1221 uint64_t last_sequence = versions_->LastSequence();
1222 Writer* last_writer = &w;
1223 if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
1224 WriteBatch* updates = BuildBatchGroup(&last_writer);
1225 WriteBatchInternal::SetSequence(updates, last_sequence + 1);
1226 last_sequence += WriteBatchInternal::Count(updates);
1232 {
1233 mutex_.Unlock();
1234 status = log_->AddRecord(WriteBatchInternal::Contents(updates));
1235 bool sync_error = false;//写log
1236 if (status.ok() && options.sync) {
1237 status = logfile_->Sync();
1238 if (!status.ok()) {
1239 sync_error = true;
1240 }
1241 }
1242 if (status.ok()) {
1243 status = WriteBatchInternal::InsertInto(updates, mem_);
1244 }
1245 mutex_.Lock();
1246 if (sync_error) {
1250 RecordBackgroundError(status);
1251 }
1252 }
1253 if (updates == tmp_batch_) tmp_batch_->Clear();
1254
1255 versions_->SetLastSequence(last_sequence);
1256 }
这里暂时不分析MakeRoomForWrite
的实现;其中BuildBatchGroup
会打包写请求:
1279 WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
1280 mutex_.AssertHeld();
1281 assert(!writers_.empty());
1282 Writer* first = writers_.front();
1283 WriteBatch* result = first->batch;
1284 assert(result != nullptr);
1286 size_t size = WriteBatchInternal::ByteSize(first->batch);
1291 size_t max_size = 1 << 20;
1292 if (size <= (128<<10)) {//调整本次打包的大小
1293 max_size = size + (128<<10);
1294 }
1296 *last_writer = first;
1297 std::deque<Writer*>::iterator iter = writers_.begin();
1298 ++iter; // Advance past "first"
1299 for (; iter != writers_.end(); ++iter) {//挨个合并
1300 Writer* w = *iter;
1301 if (w->sync && !first->sync) {//遇到同步退出
1302 // Do not include a sync write into a batch handled by a non-sync write.
1303 break;
1304 }
1305
1306 if (w->batch != nullptr) {
1307 size += WriteBatchInternal::ByteSize(w->batch);
1308 if (size > max_size) {//超过设定大小
1309 // Do not make batch too big
1310 break;
1311 }
1312
1313 // Append to *result
1314 if (result == first->batch) {
1315 // Switch to temporary batch instead of disturbing caller's batch
1316 result = tmp_batch_;
1317 assert(WriteBatchInternal::Count(result) == 0);
1318 WriteBatchInternal::Append(result, first->batch);
1319 }
1320 WriteBatchInternal::Append(result, w->batch);
1321 }
1322 *last_writer = w;
1323 }
1324 return result;
1325 }
通过以上代码实现,多线程写时,通过锁和条件变量,把自己push到队列并阻塞,由其他队头的写进行可能的合并打包,然后批量先写日志,具体格式后面再分析或参考网上资料,写完日志后,再把记录插入到skiplist后,再更新seq,再依次唤醒队列中的写线程。
这里的写在rocksdb中进行了优化,可以参考下面的链接。
参考资料
基于RocksDB做的key-value分离,解决value写放大严重的问题
key-value分离论文
RocksDB 写入流程详解
rocksdb源码分析 写优化之JoinBatchGroup
LevelDB Compaction 引发的 Data Inconsistency
Leveldb源码分析--3
LevelDB设计与实现 - 读写流程