leveldb file io 3 write
这里主要看下WritableFile
下的细节,这里只有四个方法
1
2
3
4
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0;
appendfile和writable file都是一类file,区别在于oflag不一致, O_TRUNC
表示如果文件存在,就将文件长度截断为0
1
2
3
4
5
6
7
// append
int fd = ::open(filename.c_str(),
O_TRUNC | O_WRONLY | O_CREAT | kOpenBaseFlags, 0644);
// wirte
int fd = ::open(filename.c_str(),
O_APPEND | O_WRONLY | O_CREAT | kOpenBaseFlags, 0644);
在append函数下,默认有个65536=
如果遇到大量短数据的写入,也会通过memcpy来拷贝数据到buffer中,然后再写入磁盘文件,减少对底层文件系统的调用次数,提高写操作的效率。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
Status Append(const Slice& data) override {
size_t write_size = data.size();
const char* write_data = data.data();
// Fit as much as possible into buffer.
size_t copy_size = std::min(write_size, kWritableFileBufferSize - pos_);
std::memcpy(buf_ + pos_, write_data, copy_size);
write_data += copy_size;
write_size -= copy_size;
pos_ += copy_size;
if (write_size == 0) {
return Status::OK();
}
// Can't fit in buffer, so need to do at least one write.
Status status = FlushBuffer();
if (!status.ok()) {
return status;
}
// Small writes go to buffer, large writes are written directly.
if (write_size < kWritableFileBufferSize) {
std::memcpy(buf_, write_data, write_size);
pos_ = write_size;
return Status::OK();
}
return WriteUnbuffered(write_data, write_size);
}
WriteUnbuffered
的实现就是通过系统调用write实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
Status WriteUnbuffered(const char* data, size_t size) {
while (size > 0) {
ssize_t write_result = ::write(fd_, data, size);
if (write_result < 0) {
if (errno == EINTR) {
continue; // Retry
}
return PosixError(filename_, errno);
}
data += write_result;
size -= write_result;
}
return Status::OK();
}
WritableFile 还提供了 Flush 接口,用于将内存缓冲区 buf_ 的数据写入文件,它内部也是通过调用 WriteUnbuffered 来实现。不过值得注意的是,这里 Flush 写磁盘成功,并不保证数据已经写入磁盘,甚至不能保证磁盘有足够的空间来存储内容。
如果要做数据写磁盘成功的保证,需要调用sync()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
Status Sync() override {
// Ensure new files referred to by the manifest are in the filesystem.
//
// This needs to happen before the manifest file is flushed to disk, to
// avoid crashing in a state where the manifest refers to files that are not
// yet on disk.
Status status = SyncDirIfManifest();
if (!status.ok()) {
return status;
}
status = FlushBuffer();
if (!status.ok()) {
return status;
}
return SyncFd(fd_, filename_);
}
Status SyncDirIfManifest() {
Status status;
if (!is_manifest_) {
return status;
}
int fd = ::open(dirname_.c_str(), O_RDONLY | kOpenBaseFlags);
if (fd < 0) {
status = PosixError(dirname_, errno);
} else {
status = SyncFd(fd, dirname_);
::close(fd);
}
return status;
}
// Ensures that all the caches associated with the given file descriptor's
// data are flushed all the way to durable media, and can withstand power
// failures.
//
// The path argument is only used to populate the description string in the
// returned Status if an error occurs.
static Status SyncFd(int fd, const std::string& fd_path) {
#if HAVE_FULLFSYNC
// On macOS and iOS, fsync() doesn't guarantee durability past power
// failures. fcntl(F_FULLFSYNC) is required for that purpose. Some
// filesystems don't support fcntl(F_FULLFSYNC), and require a fallback to
// fsync().
if (::fcntl(fd, F_FULLFSYNC) == 0) {
return Status::OK();
}
#endif // HAVE_FULLFSYNC
#if HAVE_FDATASYNC
bool sync_success = ::fdatasync(fd) == 0;
#else
bool sync_success = ::fsync(fd) == 0;
#endif // HAVE_FDATASYNC
if (sync_success) {
return Status::OK();
}
return PosixError(fd_path, errno);
}
这里核心是调用 SyncFd() 方法,确保文件描述符 fd 关联的所有缓冲数据都被同步到物理磁盘。
在darwin下使用了fcntl()
函数的F_FULLFSYNC
选项来确保数据被同步到物理磁盘。如果定义了HAVE_FDATASYNC
,将使用 fdatasync()
来同步数据。其他情况下,默认使用fsync()
函数来实现同样的功能。
SyncDirIfManifest
确保如果文件是manifest
文件(以MANIFEST
开始命名的文件),相关的目录更改也得到同步。
mainfest 文件记录数据库文件的元数据,包括版本信息、合并操作、数据库状态等关键信息。
文件系统在创建新文件或修改文件目录项时,这些变更可能并不立即写入磁盘。在更新manifest
文件前确保所在目录的数据已被同步到磁盘,防止系统崩溃时,manifest
文件引用的文件尚未真正写入磁盘。