diff options
Diffstat (limited to 'src/backup.cpp')
| -rw-r--r-- | src/backup.cpp | 433 |
1 files changed, 370 insertions, 63 deletions
diff --git a/src/backup.cpp b/src/backup.cpp index bf8bfaf..4a76be5 100644 --- a/src/backup.cpp +++ b/src/backup.cpp @@ -1,4 +1,4 @@ -/* Copyright 2021 Ján Sučan <jan@jansucan.com> +/* Copyright 2024 Ján Sučan <jan@jansucan.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -26,91 +26,398 @@ #include "backup.h" #include "buffered_file.h" +#include "format_v2.h" -#include <cstring> -#include <fstream> +#include <algorithm> +#include <array> +#include <cassert> +#include <iostream> +#include <vector> -static void check_files(const OptionsBackup &opts); +class Page +{ + friend bool operator==(const Page &lhs, const Page &rhs); + + public: + Page() : m_start(0), m_end(0){}; + + Page(std::shared_ptr<char[]> data, uint64_t start, uint64_t end) + : m_data(data), m_start(start), m_end(end) + { + assert(m_start <= m_end); + }; + + std::shared_ptr<char[]> getData() const { return m_data; }; + uint64_t getStart() const { return m_start; }; + uint64_t getEnd() const { return m_end; }; + size_t getSize() const { return m_end - m_start; }; + bool isEmpty() const { return getSize() == 0; }; + + private: + std::shared_ptr<char[]> m_data; + uint64_t m_start; + uint64_t m_end; +}; + +bool +operator==(const Page &lhs, const Page &rhs) +{ + return (lhs.m_data == rhs.m_data) && (lhs.m_start == rhs.m_start) && + (lhs.m_end == rhs.m_end); +} + +class PagedStreamReader +{ + public: + PagedStreamReader(std::istream &istr, size_t page_size_bytes) + : m_page_size_bytes(page_size_bytes), m_istream(istr), + m_stream_pos_bytes(0), m_buffer_index(0) + { + try { + m_buffers[0] = std::shared_ptr<char[]>(new char[m_page_size_bytes]); + m_buffers[1] = std::shared_ptr<char[]>(new char[m_page_size_bytes]); + } catch (const std::bad_alloc &e) { + throw BufferedFileError( + "cannot allocate pages for input stream data"); + } + }; + + Page getNextPage() + { + m_buffer_index = (m_buffer_index + 1) % 2; + auto buf = m_buffers[m_buffer_index]; + // Buffer for new data must not in use + assert(buf.use_count() == 2); + + const size_t bytes_read{readFromStream(buf.get())}; + m_stream_pos_bytes += bytes_read; + + if (bytes_read == 0) { + buf = std::shared_ptr<char[]>(); + } + return Page{buf, m_stream_pos_bytes - bytes_read, m_stream_pos_bytes}; + } + + private: + const size_t m_page_size_bytes; + std::istream &m_istream; + uint64_t m_stream_pos_bytes; + std::array<std::shared_ptr<char[]>, 2> m_buffers; + unsigned m_buffer_index; + + size_t readFromStream(char *const data) + { + if (m_istream.eof()) { + return 0; + } + + m_istream.read(data, m_page_size_bytes); -static void -check_files(const OptionsBackup &opts) + if (!m_istream.good() && !m_istream.eof()) { + throw BufferedFileError("cannot read from stream"); + } + + return m_istream.gcount(); + } +}; + +enum class MergeState { + Finished, + Incomplete, +}; + +class Diff +{ + friend MergeState diffsTryMerge(Diff &diff_a, Diff &diff_b, + size_t max_merge_gap, size_t max_size); + + public: + explicit Diff(uint64_t start_end) + : m_pages{}, m_start{start_end}, m_end{start_end} + { + assert(m_start <= m_end); + }; + Diff(Page page, uint64_t start, uint64_t end) + : m_pages{page}, m_start{start}, m_end{end} + { + assert(m_start <= m_end); + }; + uint64_t getStart() const { return m_start; }; + uint64_t getEnd() const { return m_end; }; + size_t getSize() const { return m_end - m_start; }; + bool isEmpty() const { return getSize() == 0; }; + + std::vector<FormatV2::RecordData> getData() const + { + std::vector<FormatV2::RecordData> data{}; + + if (!m_pages[0].isEmpty() && m_pages[1].isEmpty()) { + // Only the first page + assert((m_start >= m_pages[0].getStart()) && + (m_start <= m_pages[0].getEnd()) && + (m_end >= m_pages[0].getStart()) && + (m_end <= m_pages[0].getEnd())); + + const uint64_t offset{m_start - m_pages[0].getStart()}; + auto data_first{std::shared_ptr<char[]>{ + m_pages[0].getData(), + static_cast<char *>(m_pages[0].getData().get()) + offset}}; + data.push_back(FormatV2::RecordData{getSize(), data_first}); + } else if (!m_pages[0].isEmpty() && !m_pages[1].isEmpty()) { + // Both pages + assert((m_start >= m_pages[0].getStart()) && + (m_start <= m_pages[0].getEnd()) && + (m_end >= m_pages[1].getStart()) && + (m_end <= m_pages[1].getEnd())); + + size_t size{m_pages[0].getEnd() - m_start}; + const uint64_t offset{m_start - m_pages[0].getStart()}; + auto data_first{std::shared_ptr<char[]>{ + m_pages[0].getData(), + static_cast<char *>(m_pages[0].getData().get()) + offset}}; + data.push_back(FormatV2::RecordData{size, data_first}); + + size = m_end - m_pages[1].getStart(); + data.push_back(FormatV2::RecordData{size, m_pages[1].getData()}); + } + + return data; + }; + + private: + std::array<Page, 2> m_pages; + uint64_t m_start; + uint64_t m_end; + + bool hasPage(size_t i) // cppcheck-suppress unusedPrivateFunction + { + return (i < m_pages.size()) && (m_pages[i].getData() != nullptr); + }; +}; + +MergeState +diffsTryMerge(Diff &diff_a, Diff &diff_b, size_t max_merge_gap, size_t max_size) { - size_t in_size{0}; - try { - in_size = std::filesystem::file_size(opts.getInFilePath()); - } catch (const std::exception &e) { - throw BackupError("cannot get size of input file: " + - std::string(e.what())); + if (diff_a.isEmpty()) { + // Do not merge to an empty diff + return MergeState::Finished; } - size_t base_size{0}; - try { - base_size = std::filesystem::file_size(opts.getBaseFilePath()); - } catch (const std::exception &e) { - throw BackupError("cannot get size of base file: " + - std::string(e.what())); + if (diff_b.isEmpty()) { + // Nothing to merge from an empty diff + return MergeState::Finished; } - /* Check sizes of the input file and the base file */ - if (in_size != base_size) { - throw BackupError("input file and base file differ in size"); - } else if ((in_size % opts.getSectorSize()) != 0) { - throw BackupError( - "size of input file and base file is not multiple of " + - std::to_string(opts.getSectorSize())); + assert(diff_a.getEnd() <= diff_b.getStart()); + const size_t gap{diff_b.getStart() - diff_a.getEnd()}; + if (gap > max_merge_gap) { + // B is too far away + return MergeState::Finished; } + + if ((diff_a.getSize() + gap) >= max_size) { + // No space in A + return MergeState::Finished; + } + + // Can be merged + + // Adjust the diff start and end offsets + + // There is always at least 1 byte free in A here + const size_t free{max_size - (diff_a.getSize() + gap)}; + const size_t to_merge{std::min(free, diff_b.getSize())}; + // There is always at least 1 byte to merge from B here + + // Enlarge A + diff_a.m_end += gap + to_merge; + // Shrink B + diff_b.m_start += to_merge; + + // Add B's page to A if needed + + // Non-empty A must have only the first, or both pages + assert(diff_a.hasPage(0)); + // Non-empty B must have only the first page + assert(diff_b.hasPage(0) && !diff_b.hasPage(1)); + + // If A has both pages, B's page must only be the same as A's second + // page. No setting of pages in A is needed in this case + assert(!diff_a.hasPage(1) || (diff_b.m_pages[0] == diff_a.m_pages[1])); + if (!diff_a.hasPage(1)) { + // If A has only the first page, B's page must only be the same as the + // A's first page or following it + const bool b_follows{ + (diff_b.m_pages[0].getData() != diff_a.m_pages[0].getData()) && + (diff_b.m_pages[0].getStart() == diff_a.m_pages[0].getEnd())}; + assert((diff_b.m_pages[0] == diff_a.m_pages[0]) || b_follows); + if (b_follows) { + diff_a.m_pages[1] = diff_b.m_pages[0]; + } + } + + return (diff_a.getSize() >= max_size) ? MergeState::Finished + : MergeState::Incomplete; } +class DiffFinder +{ + public: + DiffFinder(std::istream &old_stream, std::istream &new_stream, + uint32_t buffer_size, size_t max_merge_gap) + : m_old_page_reader(old_stream, buffer_size), + m_new_page_reader(new_stream, buffer_size), + m_diff_max_size(buffer_size), m_max_merge_gap(max_merge_gap), + m_offset_in_stream(0), m_diff(0), + m_search_state(SearchState::ReadPages){}; + + Diff findNextDiff() + { + for (;;) { + if (m_search_state == SearchState::ReadPages) { + m_old_page = m_old_page_reader.getNextPage(); + m_new_page = m_new_page_reader.getNextPage(); + assert(m_old_page.getStart() == m_new_page.getStart()); + + if (m_old_page.getSize() != m_new_page.getSize()) { + throw BackupError( + "cannot read the same amount of data from both files"); + } + + const bool end_of_stream{m_old_page.isEmpty() && + m_new_page.isEmpty()}; + if (end_of_stream) { + const Diff return_diff{m_diff}; + m_diff = Diff{m_offset_in_stream}; + return return_diff; + } + + m_search_state = SearchState::FindDiff; + + } else if (m_search_state == SearchState::FindDiff) { + Diff diff{findDiffInPages(m_old_page, m_new_page, + m_offset_in_stream)}; + m_offset_in_stream = diff.getEnd(); + + if (diff.isEmpty()) { + // End of pages. On the next call, read new pages. + m_old_page = Page{}; + m_new_page = Page{}; + m_search_state = SearchState::ReadPages; + } + + const MergeState merge_state{diffsTryMerge( + m_diff, diff, m_max_merge_gap, m_diff_max_size)}; + + if (merge_state == MergeState::Finished) { + const Diff return_diff{m_diff}; + m_diff = diff; + if (!return_diff.isEmpty()) { + return return_diff; + } + } + + } else { + assert(false); + } + } + }; + + private: + enum class SearchState { ReadPages, FindDiff }; + + PagedStreamReader m_old_page_reader; + PagedStreamReader m_new_page_reader; + const size_t m_diff_max_size; + const size_t m_max_merge_gap; + Page m_old_page; + Page m_new_page; + uint64_t m_offset_in_stream; + Diff m_diff; + SearchState m_search_state; + + Diff findDiffInPages(Page old_page, Page new_page, + uint64_t offset_in_stream) + { + const char *old_data{old_page.getData().get()}; + const char *new_data{new_page.getData().get()}; + const uint64_t data_size_bytes{old_page.getSize()}; + + assert(offset_in_stream >= new_page.getStart()); + size_t offset_in_pages{offset_in_stream - new_page.getStart()}; + + // Find offset of the first different byte + for (; offset_in_pages < data_size_bytes; ++offset_in_pages) { + if (new_data[offset_in_pages] != old_data[offset_in_pages]) { + break; + } + } + const size_t start_in_pages{offset_in_pages}; + + if (offset_in_pages < data_size_bytes) { + // Different byte found. Start searching for a same byte immediately + // after. + ++offset_in_pages; + } + + // Find offset of the first same byte + for (; offset_in_pages < data_size_bytes; ++offset_in_pages) { + if (new_data[offset_in_pages] == old_data[offset_in_pages]) { + break; + } + } + const size_t end_in_pages{offset_in_pages}; + + // In the case when no different byte is found, the end offset will be + // the same as the start offset + + const uint64_t start_in_stream{new_page.getStart() + start_in_pages}; + const uint64_t end_in_stream{new_page.getStart() + end_in_pages}; + if (start_in_stream == end_in_stream) { + return Diff{start_in_stream}; + } else { + return Diff{new_page, start_in_stream, end_in_stream}; + } + } +}; + void backup(const OptionsBackup &opts) { - check_files(opts); - - BufferedFileReader in_file(opts.getInFilePath(), opts.getBufferSize()); - BufferedFileReader base_file(opts.getBaseFilePath(), opts.getBufferSize()); - BufferedFileWriter out_file(opts.getOutFilePath(), opts.getBufferSize()); + std::ifstream in_istream{opts.getInFilePath(), + std::ifstream::in | std::ifstream::binary}; + if (!in_istream) { + throw BufferedFileError("cannot open input file"); + } - std::unique_ptr<char[]> in_buffer; - try { - in_buffer = std::make_unique<char[]>(opts.getSectorSize()); - } catch (const std::bad_alloc &e) { - throw BackupError("cannot allocate sector buffer for input file data"); + std::ifstream base_istream{opts.getBaseFilePath(), + std::ifstream::in | std::ifstream::binary}; + if (!base_istream) { + throw BufferedFileError("cannot open base file"); } - std::unique_ptr<char[]> base_buffer; - try { - base_buffer = std::make_unique<char[]>(opts.getSectorSize()); - } catch (const std::bad_alloc &e) { - throw BackupError("cannot allocate sector buffer for base file data"); + // When backing up, the output file is truncated to hold the new data + std::ofstream out_ostream{opts.getOutFilePath(), std::ofstream::out | + std::ofstream::trunc | + std::ofstream::binary}; + if (!out_ostream) { + throw BufferedFileError("cannot open output file"); } - uint64_t input_file_offset{0}; + DiffFinder diff_finder(base_istream, in_istream, opts.getBufferSize(), + FormatV2::RecordHeaderSize); + FormatV2::Writer diff_writer(out_ostream, opts.getBufferSize()); + for (;;) { - // Read sectors - const size_t in_read_size = - in_file.read(in_buffer.get(), opts.getSectorSize()); - const size_t base_read_size = - base_file.read(base_buffer.get(), opts.getSectorSize()); - - if (in_read_size != base_read_size) { - throw BackupError( - "cannot read equal amount of bytes from the input files"); - } else if (in_read_size == 0) { + const Diff diff{diff_finder.findNextDiff()}; + if (diff.isEmpty()) { break; - } else if (in_read_size != opts.getSectorSize()) { - throw BackupError("cannot read full sectors from the input files"); } - // Check for difference - const bool differ = (memcmp(in_buffer.get(), base_buffer.get(), - opts.getSectorSize()) != 0); - if (differ) { - // Backup sector - uint64_t o = htole64(input_file_offset); - out_file.write(reinterpret_cast<char *>(&o), sizeof(o)); - out_file.write(in_buffer.get(), opts.getSectorSize()); - } + diff_writer.writeDiffRecord(diff.getStart(), diff.getSize(), + diff.getData()); - input_file_offset += opts.getSectorSize(); + // Here, the diff is destructed and page data reference counters + // decremented } } |
