From bfbdf9732ccb0f164e5f91d59ae279a27c92849b Mon Sep 17 00:00:00 2001 From: theWangs <1428148408@qq.com> Date: Mon, 9 Jun 2025 11:24:54 +0800 Subject: [PATCH] =?UTF-8?q?win=E5=AD=90=E7=9B=AE=E5=BD=95=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E5=8A=9F=E8=83=BD=E6=96=87=E4=BB=B6io=5Fwin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- storage/rocksdb/rocksdb/port/win/io_win.cc | 1073 ++++++++++++++++++++ storage/rocksdb/rocksdb/port/win/io_win.h | 456 +++++++++ 2 files changed, 1529 insertions(+) create mode 100644 storage/rocksdb/rocksdb/port/win/io_win.cc create mode 100644 storage/rocksdb/rocksdb/port/win/io_win.h diff --git a/storage/rocksdb/rocksdb/port/win/io_win.cc b/storage/rocksdb/rocksdb/port/win/io_win.cc new file mode 100644 index 000000000..f8d1c3dbb --- /dev/null +++ b/storage/rocksdb/rocksdb/port/win/io_win.cc @@ -0,0 +1,1073 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#if defined(OS_WIN) + +#include "port/win/io_win.h" + +#include "monitoring/iostats_context_imp.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace port { + +/* +* DirectIOHelper +*/ +namespace { + +const size_t kSectorSize = 512; + +inline +bool IsPowerOfTwo(const size_t alignment) { + return ((alignment) & (alignment - 1)) == 0; +} + +inline +bool IsSectorAligned(const size_t off) { + return (off & (kSectorSize - 1)) == 0; +} + +inline +bool IsAligned(size_t alignment, const void* ptr) { + return ((uintptr_t(ptr)) & (alignment - 1)) == 0; +} +} + + +std::string GetWindowsErrSz(DWORD err) { + LPSTR lpMsgBuf; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); + + std::string Err = lpMsgBuf; + LocalFree(lpMsgBuf); + return Err; +} + +// We preserve the original name of this interface to denote the original idea +// behind it. +// All reads happen by a specified offset and pwrite interface does not change +// the position of the file pointer. Judging from the man page and errno it does +// execute +// lseek atomically to return the position of the file back where it was. +// WriteFile() does not +// have this capability. Therefore, for both pread and pwrite the pointer is +// advanced to the next position +// which is fine for writes because they are (should be) sequential. +// Because all the reads/writes happen by the specified offset, the caller in +// theory should not +// rely on the current file offset. +Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + + Status s; + bytes_written = 0; + + size_t num_bytes = data.size(); + if (num_bytes > std::numeric_limits::max()) { + // May happen in 64-bit builds where size_t is 64-bits but + // long is still 32-bit, but that's the API here at the moment + return Status::InvalidArgument("num_bytes is too large for a single write: " + + file_data->GetName()); + } + + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesWritten = 0; + + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), + &bytesWritten, &overlapped)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + + return s; +} + +// See comments for pwrite above +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + + Status s; + bytes_read = 0; + + if (num_bytes > std::numeric_limits::max()) { + return Status::InvalidArgument("num_bytes is too large for a single read: " + + file_data->GetName()); + } + + OVERLAPPED overlapped = { 0 }; + ULARGE_INTEGER offsetUnion; + offsetUnion.QuadPart = offset; + + overlapped.Offset = offsetUnion.LowPart; + overlapped.OffsetHigh = offsetUnion.HighPart; + + DWORD bytesRead = 0; + + if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), + &bytesRead, &overlapped)) { + auto lastError = GetLastError(); + // EOF is OK with zero bytes read + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), + lastError); + } + } else { + bytes_read = bytesRead; + } + + return s; +} + +// SetFileInformationByHandle() is capable of fast pre-allocates. +// However, this does not change the file end position unless the file is +// truncated and the pre-allocated space is not considered filled with zeros. +Status fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + Status status; + + FILE_ALLOCATION_INFO alloc_info; + alloc_info.AllocationSize.QuadPart = to_size; + + if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, + sizeof(FILE_ALLOCATION_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError( + "Failed to pre-allocate space: " + filename, lastError); + } + + return status; +} + +Status ftruncate(const std::string& filename, HANDLE hFile, + uint64_t toSize) { + Status status; + + FILE_END_OF_FILE_INFO end_of_file; + end_of_file.EndOfFile.QuadPart = toSize; + + if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, + sizeof(FILE_END_OF_FILE_INFO))) { + auto lastError = GetLastError(); + status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, + lastError); + } + + return status; +} + +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporarily solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. + return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// WinMmapReadableFile + +WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, + HANDLE hFile, HANDLE hMap, + const void* mapped_region, + size_t length) + : WinFileData(fileName, hFile, false /* use_direct_io */), + hMap_(hMap), + mapped_region_(mapped_region), + length_(length) {} + +WinMmapReadableFile::~WinMmapReadableFile() { + BOOL ret __attribute__((__unused__)); + ret = ::UnmapViewOfFile(mapped_region_); + assert(ret); + + ret = ::CloseHandle(hMap_); + assert(ret); +} + +Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + + if (offset > length_) { + *result = Slice(); + return IOError(filename_, EINVAL); + } else if (offset + n > length_) { + n = length_ - static_cast(offset); + } + *result = + Slice(reinterpret_cast(mapped_region_)+offset, n); + return s; +} + +Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +/////////////////////////////////////////////////////////////////////////////// +/// WinMmapFile + + +// Can only truncate or reserve to a sector size aligned if +// used on files that are opened with Unbuffered I/O +Status WinMmapFile::TruncateFile(uint64_t toSize) { + return ftruncate(filename_, hFile_, toSize); +} + +Status WinMmapFile::UnmapCurrentRegion() { + Status status; + + if (mapped_begin_ != nullptr) { + if (!::UnmapViewOfFile(mapped_begin_)) { + status = IOErrorFromWindowsError( + "Failed to unmap file view: " + filename_, GetLastError()); + } + + // Move on to the next portion of the file + file_offset_ += view_size_; + + // UnmapView automatically sends data to disk but not the metadata + // which is good and provides some equivalent of fdatasync() on Linux + // therefore, we donot need separate flag for metadata + mapped_begin_ = nullptr; + mapped_end_ = nullptr; + dst_ = nullptr; + + last_sync_ = nullptr; + pending_sync_ = false; + } + + return status; +} + +Status WinMmapFile::MapNewRegion() { + + Status status; + + assert(mapped_begin_ == nullptr); + + size_t minDiskSize = static_cast(file_offset_) + view_size_; + + if (minDiskSize > reserved_size_) { + status = Allocate(file_offset_, view_size_); + if (!status.ok()) { + return status; + } + } + + // Need to remap + if (hMap_ == NULL || reserved_size_ > mapping_size_) { + + if (hMap_ != NULL) { + // Unmap the previous one + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hMap_); + assert(ret); + hMap_ = NULL; + } + + ULARGE_INTEGER mappingSize; + mappingSize.QuadPart = reserved_size_; + + hMap_ = CreateFileMappingA( + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name + + if (NULL == hMap_) { + return IOErrorFromWindowsError( + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); + } + + mapping_size_ = reserved_size_; + } + + ULARGE_INTEGER offset; + offset.QuadPart = file_offset_; + + // View must begin at the granularity aligned offset + mapped_begin_ = reinterpret_cast( + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); + + if (!mapped_begin_) { + status = IOErrorFromWindowsError( + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); + } else { + mapped_end_ = mapped_begin_ + view_size_; + dst_ = mapped_begin_; + last_sync_ = mapped_begin_; + pending_sync_ = false; + } + return status; +} + +Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(filename_, hFile_, spaceToReserve); +} + +WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, + size_t page_size, size_t allocation_granularity, + const EnvOptions& options) + : WinFileData(fname, hFile, false), + WritableFile(options), + hMap_(NULL), + page_size_(page_size), + allocation_granularity_(allocation_granularity), + reserved_size_(0), + mapping_size_(0), + view_size_(0), + mapped_begin_(nullptr), + mapped_end_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0), + pending_sync_(false) { + // Allocation granularity must be obtained from GetSystemInfo() and must be + // a power of two. + assert(allocation_granularity > 0); + assert((allocation_granularity & (allocation_granularity - 1)) == 0); + + assert(page_size > 0); + assert((page_size & (page_size - 1)) == 0); + + // Only for memory mapped writes + assert(options.use_mmap_writes); + + // View size must be both the multiple of allocation_granularity AND the + // page size and the granularity is usually a multiple of a page size. + const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + view_size_ = Roundup(viewSize, allocation_granularity_); +} + +WinMmapFile::~WinMmapFile() { + if (hFile_) { + this->Close(); + } +} + +Status WinMmapFile::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + + while (left > 0) { + assert(mapped_begin_ <= dst_); + size_t avail = mapped_end_ - dst_; + + if (avail == 0) { + Status s = UnmapCurrentRegion(); + if (s.ok()) { + s = MapNewRegion(); + } + + if (!s.ok()) { + return s; + } + } else { + size_t n = std::min(left, avail); + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + pending_sync_ = true; + } + } + + // Now make sure that the last partial page is padded with zeros if needed + size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_); + if (bytesToPad > 0) { + memset(dst_, 0, bytesToPad); + } + + return Status::OK(); +} + +// Means Close() will properly take care of truncate +// and it does not need any additional information +Status WinMmapFile::Truncate(uint64_t size) { + return Status::OK(); +} + +Status WinMmapFile::Close() { + Status s; + + assert(NULL != hFile_); + + // We truncate to the precise size so no + // uninitialized data at the end. SetEndOfFile + // which we use does not write zeros and it is good. + uint64_t targetSize = GetFileSize(); + + if (mapped_begin_ != nullptr) { + // Sync before unmapping to make sure everything + // is on disk and there is not a lazy writing + // so we are deterministic with the tests + Sync(); + s = UnmapCurrentRegion(); + } + + if (NULL != hMap_) { + BOOL ret = ::CloseHandle(hMap_); + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to Close mapping for file: " + filename_, lastError); + } + + hMap_ = NULL; + } + + if (hFile_ != NULL) { + + TruncateFile(targetSize); + + BOOL ret = ::CloseHandle(hFile_); + hFile_ = NULL; + + if (!ret && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to close file map handle: " + filename_, lastError); + } + } + + return s; +} + +Status WinMmapFile::Flush() { return Status::OK(); } + +// Flush only data +Status WinMmapFile::Sync() { + Status s; + + // Some writes occurred since last sync + if (dst_ > last_sync_) { + assert(mapped_begin_); + assert(dst_); + assert(dst_ > mapped_begin_); + assert(dst_ < mapped_end_); + + size_t page_begin = + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + size_t page_end = + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + + // Flush only the amount of that is a multiple of pages + if (!::FlushViewOfFile(mapped_begin_ + page_begin, + (page_end - page_begin) + page_size_)) { + s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, + GetLastError()); + } else { + last_sync_ = dst_; + } + } + + return s; +} + +/** +* Flush data as well as metadata to stable storage. +*/ +Status WinMmapFile::Fsync() { + Status s = Sync(); + + // Flush metadata + if (s.ok() && pending_sync_) { + if (!::FlushFileBuffers(hFile_)) { + s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, + GetLastError()); + } + pending_sync_ = false; + } + + return s; +} + +/** +* Get the size of valid data in the file. This will not match the +* size that is returned from the filesystem because we use mmap +* to extend file by map_size every time. +*/ +uint64_t WinMmapFile::GetFileSize() { + size_t used = dst_ - mapped_begin_; + return file_offset_ + used; +} + +Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { + Status status; + TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); + // Nothing to do + if (spaceToReserve <= reserved_size_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reserved_size_ = spaceToReserve; + } + return status; +} + +size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(hFile_, id, max_size); +} + +////////////////////////////////////////////////////////////////////////////////// +// WinSequentialFile + +WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, + const EnvOptions& options) + : WinFileData(fname, f, options.use_direct_reads) {} + +WinSequentialFile::~WinSequentialFile() { + assert(hFile_ != INVALID_HANDLE_VALUE); +} + +Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = 0; + + assert(result != nullptr); + if (WinFileData::use_direct_io()) { + return Status::NotSupported("Read() does not support direct_io"); + } + + // Windows ReadFile API accepts a DWORD. + // While it is possible to read in a loop if n is too big + // it is an unlikely case. + if (n > std::numeric_limits::max()) { + return Status::InvalidArgument("n is too big for a single ReadFile: " + + filename_); + } + + DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesRead = 0; + BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); + if (ret != FALSE) { + r = bytesRead; + } else { + auto lastError = GetLastError(); + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, + lastError); + } + } + + *result = Slice(scratch, r); + return s; +} + +Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const { + return pread(this, src, numBytes, offset, bytes_read); +} + +Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) { + + Status s; + + if (!WinFileData::use_direct_io()) { + return Status::NotSupported("This function is only used for direct_io"); + } + + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(n)) { + return Status::InvalidArgument( + "WinSequentialFile::PositionedRead: offset is not properly aligned"); + } + + size_t bytes_read = 0; // out param + s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + + +Status WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit + // integer. As such it is a highly unlikley case to have n so large. + if (n > static_cast(std::numeric_limits::max())) { + return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + + filename_); + } + + LARGE_INTEGER li; + li.QuadPart = static_cast(n); //cast is safe due to the check above + BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); + if (ret == FALSE) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); + } + return Status::OK(); +} + +Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessBase + +inline +Status WinRandomAccessImpl::PositionedReadInternal(char* src, + size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { + return pread(file_base_, src, numBytes, offset, bytes_read); +} + +inline +WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const EnvOptions& options) : + file_base_(file_base), + alignment_(alignment) { + + assert(!options.use_mmap_reads); +} + +inline +Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + + Status s; + + // Check buffer alignment + if (file_base_->use_direct_io()) { + if (!IsSectorAligned(static_cast(offset)) || + !IsAligned(alignment_, scratch)) { + return Status::InvalidArgument( + "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); + } + } + + if (n == 0) { + *result = Slice(scratch, 0); + return s; + } + + size_t bytes_read = 0; + s = PositionedReadInternal(scratch, n, offset, bytes_read); + *result = Slice(scratch, bytes_read); + return s; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// WinRandomAccessFile + +WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, + size_t alignment, + const EnvOptions& options) + : WinFileData(fname, hFile, options.use_direct_reads), + WinRandomAccessImpl(this, alignment, options) {} + +WinRandomAccessFile::~WinRandomAccessFile() { +} + +Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return ReadImpl(offset, n, result, scratch); +} + +Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return Status::OK(); +} + +size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { + return GetAlignment(); +} + +///////////////////////////////////////////////////////////////////////////// +// WinWritableImpl +// + +inline +Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +} + +inline +WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) + : file_data_(file_data), + alignment_(alignment), + next_write_offset_(0), + reservedsize_(0) { + + // Query current position in case ReopenWritableFile is called + // This position is only important for buffered writes + // for unbuffered writes we explicitely specify the position. + LARGE_INTEGER zero_move; + zero_move.QuadPart = 0; // Do not move + LARGE_INTEGER pos; + pos.QuadPart = 0; + BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, + FILE_CURRENT); + // Querying no supped to fail + if (ret != 0) { + next_write_offset_ = pos.QuadPart; + } else { + assert(false); + } +} + +inline +Status WinWritableImpl::AppendImpl(const Slice& data) { + + Status s; + + if (data.size() > std::numeric_limits::max()) { + return Status::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); + } + + size_t bytes_written = 0; // out param + + if (file_data_->use_direct_io()) { + // With no offset specified we are appending + // to the end of the file + assert(IsSectorAligned(next_write_offset_)); + if (!IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + s = Status::InvalidArgument( + "WriteData must be page aligned, size must be sector aligned"); + } else { + s = pwrite(file_data_, data, next_write_offset_, bytes_written); + } + } else { + + DWORD bytesWritten = 0; + if (!WriteFile(file_data_->GetFileHandle(), data.data(), + static_cast(data.size()), &bytesWritten, NULL)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to WriteFile: " + file_data_->GetName(), + lastError); + } else { + bytes_written = bytesWritten; + } + } + + if(s.ok()) { + if (bytes_written == data.size()) { + // This matters for direct_io cases where + // we rely on the fact that next_write_offset_ + // is sector aligned + next_write_offset_ += bytes_written; + } else { + s = Status::IOError("Failed to write all bytes: " + + file_data_->GetName()); + } + } + + return s; +} + +inline +Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { + + if(file_data_->use_direct_io()) { + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + return Status::InvalidArgument( + "Data and offset must be page aligned, size must be sector aligned"); + } + } + + size_t bytes_written = 0; + Status s = pwrite(file_data_, data, offset, bytes_written); + + if(s.ok()) { + if (bytes_written == data.size()) { + // For sequential write this would be simple + // size extension by data.size() + uint64_t write_end = offset + bytes_written; + if (write_end >= next_write_offset_) { + next_write_offset_ = write_end; + } + } else { + s = Status::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); + } + } + return s; +} + +inline +Status WinWritableImpl::TruncateImpl(uint64_t size) { + + // It is tempting to check for the size for sector alignment + // but truncation may come at the end and there is not a requirement + // for this to be sector aligned so long as we do not attempt to write + // after that. The interface docs state that the behavior is undefined + // in that case. + Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), + size); + + if (s.ok()) { + next_write_offset_ = size; + } + return s; +} + +inline +Status WinWritableImpl::CloseImpl() { + + Status s; + + auto hFile = file_data_->GetFileHandle(); + assert(INVALID_HANDLE_VALUE != hFile); + + if (!::FlushFileBuffers(hFile)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + + file_data_->GetName(), + lastError); + } + + if(!file_data_->CloseFile() && s.ok()) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), + lastError); + } + return s; +} + +inline +Status WinWritableImpl::SyncImpl() { + Status s; + if (!::FlushFileBuffers (file_data_->GetFileHandle())) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + } + return s; +} + + +inline +Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + Status status; + TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); + + // Make sure that we reserve an aligned amount of space + // since the reservation block size is driven outside so we want + // to check if we are ok with reservation here + size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + // Nothing to do + if (spaceToReserve <= reservedsize_) { + return status; + } + + IOSTATS_TIMER_GUARD(allocate_nanos); + status = PreallocateInternal(spaceToReserve); + if (status.ok()) { + reservedsize_ = spaceToReserve; + } + return status; +} + + +//////////////////////////////////////////////////////////////////////////////// +/// WinWritableFile + +WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, + size_t alignment, size_t /* capacity */, + const EnvOptions& options) + : WinFileData(fname, hFile, options.use_direct_writes), + WinWritableImpl(this, alignment), + WritableFile(options) { + assert(!options.use_mmap_writes); +} + +WinWritableFile::~WinWritableFile() { +} + +// Indicates if the class makes use of direct I/O +bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } + +size_t WinWritableFile::GetRequiredBufferAlignment() const { + return static_cast(GetAlignement()); +} + +Status WinWritableFile::Append(const Slice& data) { + return AppendImpl(data); +} + +Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { + return PositionedAppendImpl(data, offset); +} + +// Need to implement this so the file is truncated correctly +// when buffered and unbuffered mode +Status WinWritableFile::Truncate(uint64_t size) { + return TruncateImpl(size); +} + +Status WinWritableFile::Close() { + return CloseImpl(); +} + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter +Status WinWritableFile::Flush() { + return Status::OK(); +} + +Status WinWritableFile::Sync() { + return SyncImpl(); +} + +Status WinWritableFile::Fsync() { return SyncImpl(); } + +bool WinWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t WinWritableFile::GetFileSize() { + return GetFileNextWriteOffset(); +} + +Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) { + return AllocateImpl(offset, len); +} + +size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(GetFileHandle(), id, max_size); +} + +///////////////////////////////////////////////////////////////////////// +/// WinRandomRWFile + +WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, + size_t alignment, const EnvOptions& options) + : WinFileData(fname, hFile, + options.use_direct_reads && options.use_direct_writes), + WinRandomAccessImpl(this, alignment, options), + WinWritableImpl(this, alignment) {} + +bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } + +size_t WinRandomRWFile::GetRequiredBufferAlignment() const { + return static_cast(GetAlignement()); +} + +Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { + return PositionedAppendImpl(data, offset); +} + +Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return ReadImpl(offset, n, result, scratch); +} + +Status WinRandomRWFile::Flush() { + return Status::OK(); +} + +Status WinRandomRWFile::Sync() { + return SyncImpl(); +} + +Status WinRandomRWFile::Close() { + return CloseImpl(); +} + +////////////////////////////////////////////////////////////////////////// +/// WinMemoryMappedBufer +WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif + if (base_ != nullptr) { + ret = ::UnmapViewOfFile(base_); + assert(ret); + base_ = nullptr; + } + if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(map_handle_); + assert(ret); + map_handle_ = NULL; + } + if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(file_handle_); + assert(ret); + file_handle_ = NULL; + } +} + +////////////////////////////////////////////////////////////////////////// +/// WinDirectory + +Status WinDirectory::Fsync() { return Status::OK(); } + +size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(handle_, id, max_size); +} +////////////////////////////////////////////////////////////////////////// +/// WinFileLock + +WinFileLock::~WinFileLock() { + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hFile_); + assert(ret); +} + +} +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/storage/rocksdb/rocksdb/port/win/io_win.h b/storage/rocksdb/rocksdb/port/win/io_win.h new file mode 100644 index 000000000..d7aa7b483 --- /dev/null +++ b/storage/rocksdb/rocksdb/port/win/io_win.h @@ -0,0 +1,456 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/env.h" +#include "util/aligned_buffer.h" + +#include + +namespace ROCKSDB_NAMESPACE { +namespace port { + +std::string GetWindowsErrSz(DWORD err); + +inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { + return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) + ? Status::NoSpace(context, GetWindowsErrSz(err)) + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? Status::PathNotFound(context, GetWindowsErrSz(err)) + : Status::IOError(context, GetWindowsErrSz(err)); +} + +inline Status IOErrorFromLastWindowsError(const std::string& context) { + return IOErrorFromWindowsError(context, GetLastError()); +} + +inline Status IOError(const std::string& context, int err_number) { + return (err_number == ENOSPC) + ? Status::NoSpace(context, strerror(err_number)) + : (err_number == ENOENT) + ? Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); +} + +class WinFileData; + +Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); + +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); + +Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); + +Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); + +size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); + +class WinFileData { + protected: + const std::string filename_; + HANDLE hFile_; + // If true, the I/O issued would be direct I/O which the buffer + // will need to be aligned (not sure there is a guarantee that the buffer + // passed in is aligned). + const bool use_direct_io_; + + public: + // We want this class be usable both for inheritance (prive + // or protected) and for containment so __ctor and __dtor public + WinFileData(const std::string& filename, HANDLE hFile, bool direct_io) + : filename_(filename), hFile_(hFile), use_direct_io_(direct_io) {} + + virtual ~WinFileData() { this->CloseFile(); } + + bool CloseFile() { + bool result = true; + + if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) { + result = ::CloseHandle(hFile_); + assert(result); + hFile_ = NULL; + } + return result; + } + + const std::string& GetName() const { return filename_; } + + HANDLE GetFileHandle() const { return hFile_; } + + bool use_direct_io() const { return use_direct_io_; } + + WinFileData(const WinFileData&) = delete; + WinFileData& operator=(const WinFileData&) = delete; +}; + +class WinSequentialFile : protected WinFileData, public SequentialFile { + + // Override for behavior change when creating a custom env + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; + +public: + WinSequentialFile(const std::string& fname, HANDLE f, + const EnvOptions& options); + + ~WinSequentialFile(); + + WinSequentialFile(const WinSequentialFile&) = delete; + WinSequentialFile& operator=(const WinSequentialFile&) = delete; + + virtual Status Read(size_t n, Slice* result, char* scratch) override; + virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override; + + virtual Status Skip(uint64_t n) override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } +}; + +// mmap() based random-access +class WinMmapReadableFile : private WinFileData, public RandomAccessFile { + HANDLE hMap_; + + const void* mapped_region_; + const size_t length_; + + public: + // mapped_region_[0,length-1] contains the mmapped contents of the file. + WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap, + const void* mapped_region, size_t length); + + ~WinMmapReadableFile(); + + WinMmapReadableFile(const WinMmapReadableFile&) = delete; + WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +// We preallocate and use memcpy to append new +// data to the file. This is safe since we either properly close the +// file before reading from it, or for log files, the reading code +// knows enough to skip zero suffixes. +class WinMmapFile : private WinFileData, public WritableFile { + private: + HANDLE hMap_; + + const size_t page_size_; // We flush the mapping view in page_size + // increments. We may decide if this is a memory + // page size or SSD page size + const size_t + allocation_granularity_; // View must start at such a granularity + + size_t reserved_size_; // Preallocated size + + size_t mapping_size_; // The max size of the mapping object + // we want to guess the final file size to minimize the remapping + size_t view_size_; // How much memory to map into a view at a time + + char* mapped_begin_; // Must begin at the file offset that is aligned with + // allocation_granularity_ + char* mapped_end_; + char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_]) + char* last_sync_; // Where have we synced up to + + uint64_t file_offset_; // Offset of mapped_begin_ in file + + // Do we have unsynced writes? + bool pending_sync_; + + // Can only truncate or reserve to a sector size aligned if + // used on files that are opened with Unbuffered I/O + Status TruncateFile(uint64_t toSize); + + Status UnmapCurrentRegion(); + + Status MapNewRegion(); + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + + public: + WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, + size_t allocation_granularity, const EnvOptions& options); + + ~WinMmapFile(); + + WinMmapFile(const WinMmapFile&) = delete; + WinMmapFile& operator=(const WinMmapFile&) = delete; + + virtual Status Append(const Slice& data) override; + + // Means Close() will properly take care of truncate + // and it does not need any additional information + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + virtual Status Flush() override; + + // Flush only data + virtual Status Sync() override; + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() override; + + /** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + virtual uint64_t GetFileSize() override; + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomAccessImpl { + protected: + WinFileData* file_base_; + size_t alignment_; + + // Override for behavior change when creating a custom env + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; + + WinRandomAccessImpl(WinFileData* file_base, size_t alignment, + const EnvOptions& options); + + virtual ~WinRandomAccessImpl() {} + + Status ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; + + size_t GetAlignment() const { return alignment_; } + + public: + + WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; + WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; +}; + +// pread() based random-access +class WinRandomAccessFile + : private WinFileData, + protected WinRandomAccessImpl, // Want to be able to override + // PositionedReadInternal + public RandomAccessFile { + public: + WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options); + + ~WinRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; + + virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + + virtual Status InvalidateCache(size_t offset, size_t length) override; + + virtual size_t GetRequiredBufferAlignment() const override; +}; + +// This is a sequential write class. It has been mimicked (as others) after +// the original Posix class. We add support for unbuffered I/O on windows as +// well +// we utilize the original buffer as an alignment buffer to write directly to +// file with no buffering. +// No buffering requires that the provided buffer is aligned to the physical +// sector size (SSD page size) and +// that all SetFilePointer() operations to occur with such an alignment. +// We thus always write in sector/page size increments to the drive and leave +// the tail for the next write OR for Close() at which point we pad with zeros. +// No padding is required for +// buffered access. +class WinWritableImpl { + protected: + WinFileData* file_data_; + const uint64_t alignment_; + uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t reservedsize_; // how far we have reserved space + + virtual Status PreallocateInternal(uint64_t spaceToReserve); + + WinWritableImpl(WinFileData* file_data, size_t alignment); + + ~WinWritableImpl() {} + + uint64_t GetAlignement() const { return alignment_; } + + Status AppendImpl(const Slice& data); + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + Status PositionedAppendImpl(const Slice& data, uint64_t offset); + + Status TruncateImpl(uint64_t size); + + Status CloseImpl(); + + Status SyncImpl(); + + uint64_t GetFileNextWriteOffset() { + // Double accounting now here with WritableFileWriter + // and this size will be wrong when unbuffered access is used + // but tests implement their own writable files and do not use + // WritableFileWrapper + // so we need to squeeze a square peg through + // a round hole here. + return next_write_offset_; + } + + Status AllocateImpl(uint64_t offset, uint64_t len); + + public: + WinWritableImpl(const WinWritableImpl&) = delete; + WinWritableImpl& operator=(const WinWritableImpl&) = delete; +}; + +class WinWritableFile : private WinFileData, + protected WinWritableImpl, + public WritableFile { + public: + WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, + size_t capacity, const EnvOptions& options); + + ~WinWritableFile(); + + virtual Status Append(const Slice& data) override; + + // Requires that the data is aligned as specified by + // GetRequiredBufferAlignment() + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + + // Need to implement this so the file is truncated correctly + // when buffered and unbuffered mode + virtual Status Truncate(uint64_t size) override; + + virtual Status Close() override; + + // write out the cached data to the OS cache + // This is now taken care of the WritableFileWriter + virtual Status Flush() override; + + virtual Status Sync() override; + + virtual Status Fsync() override; + + virtual bool IsSyncThreadSafe() const override; + + // Indicates if the class makes use of direct I/O + // Use PositionedAppend + virtual bool use_direct_io() const override; + + virtual size_t GetRequiredBufferAlignment() const override; + + virtual uint64_t GetFileSize() override; + + virtual Status Allocate(uint64_t offset, uint64_t len) override; + + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinRandomRWFile : private WinFileData, + protected WinRandomAccessImpl, + protected WinWritableImpl, + public RandomRWFile { + public: + WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment, + const EnvOptions& options); + + ~WinRandomRWFile() {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate aligned + // buffer for Write() when use_direct_io() returns true + virtual size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual Status Write(uint64_t offset, const Slice& data) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status Flush() override; + + virtual Status Sync() override; + + virtual Status Fsync() override { return Sync(); } + + virtual Status Close() override; +}; + +class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { +private: + HANDLE file_handle_; + HANDLE map_handle_; +public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : + MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} + ~WinMemoryMappedBuffer() override; +}; + +class WinDirectory : public Directory { + HANDLE handle_; + public: + explicit WinDirectory(HANDLE h) noexcept : handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + ::CloseHandle(handle_); + } + virtual Status Fsync() override; + + size_t GetUniqueId(char* id, size_t max_size) const override; +}; + +class WinFileLock : public FileLock { + public: + explicit WinFileLock(HANDLE hFile) : hFile_(hFile) { + assert(hFile != NULL); + assert(hFile != INVALID_HANDLE_VALUE); + } + + ~WinFileLock(); + + private: + HANDLE hFile_; +}; +} +} // namespace ROCKSDB_NAMESPACE -- Gitee