diff options
| author | Jan Sucan <jan@jansucan.com> | 2025-01-22 17:04:24 +0100 |
|---|---|---|
| committer | Ján Sučan <jan@jansucan.com> | 2025-01-22 17:12:56 +0100 |
| commit | 722a52b94e4d8ae73bc4ee92472dba8e7174a1ab (patch) | |
| tree | a13f6dabaf9f225dfb4c8352b7bcbcb39c97b38f /src/backup.cpp | |
| parent | cf8eac7777e81590b24e8d285560188e31035c16 (diff) | |
Rename backup operation to create
Diffstat (limited to 'src/backup.cpp')
| -rw-r--r-- | src/backup.cpp | 391 |
1 files changed, 0 insertions, 391 deletions
diff --git a/src/backup.cpp b/src/backup.cpp deleted file mode 100644 index 98898b9..0000000 --- a/src/backup.cpp +++ /dev/null @@ -1,391 +0,0 @@ -/* Copyright 2024 Ján Sučan <jan@jansucan.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "backup.h" -#include "buffered_stream.h" -#include "format_v2.h" - -#include <algorithm> -#include <array> -#include <cassert> -#include <iostream> -#include <vector> - -class Page -{ - friend bool operator==(const Page &lhs, const Page &rhs); - - public: - Page() : m_start(0), m_end(0){}; - - Page(std::shared_ptr<char[]> data, uint64_t start, uint64_t end) - : m_data(data), m_start(start), m_end(end) - { - assert(m_start <= m_end); - }; - - std::shared_ptr<char[]> getData() const { return m_data; }; - uint64_t getStart() const { return m_start; }; - uint64_t getEnd() const { return m_end; }; - size_t getSize() const { return m_end - m_start; }; - bool isEmpty() const { return getSize() == 0; }; - - private: - std::shared_ptr<char[]> m_data; - uint64_t m_start; - uint64_t m_end; -}; - -bool -operator==(const Page &lhs, const Page &rhs) -{ - return (lhs.m_data == rhs.m_data) && (lhs.m_start == rhs.m_start) && - (lhs.m_end == rhs.m_end); -} - -class PagedStreamReader -{ - public: - PagedStreamReader(std::istream &istr, size_t page_size_bytes) - : m_page_size_bytes(page_size_bytes), - m_reader(istr, page_size_bytes, 2), m_stream_pos_bytes(0){}; - - Page getNextPage() - { - const BufferedStream::DataPart dp{ - m_reader.readMultipart(m_page_size_bytes)}; - - m_stream_pos_bytes += dp.size; - - return Page{dp.data, m_stream_pos_bytes - dp.size, m_stream_pos_bytes}; - } - - private: - const size_t m_page_size_bytes; - BufferedStream::Reader m_reader; - uint64_t m_stream_pos_bytes; -}; - -enum class MergeState { - Finished, - Incomplete, -}; - -class Diff -{ - friend MergeState diffsTryMerge(Diff &diff_a, Diff &diff_b, - size_t max_merge_gap, size_t max_size); - - public: - explicit Diff(uint64_t start_end) - : m_pages{}, m_start{start_end}, m_end{start_end} - { - assert(m_start <= m_end); - }; - Diff(Page page, uint64_t start, uint64_t end) - : m_pages{page}, m_start{start}, m_end{end} - { - assert(m_start <= m_end); - }; - uint64_t getStart() const { return m_start; }; - uint64_t getEnd() const { return m_end; }; - size_t getSize() const { return m_end - m_start; }; - bool isEmpty() const { return getSize() == 0; }; - - std::vector<FormatV2::RecordData> getData() const - { - std::vector<FormatV2::RecordData> data{}; - - if (!m_pages[0].isEmpty() && m_pages[1].isEmpty()) { - // Only the first page - assert((m_start >= m_pages[0].getStart()) && - (m_start <= m_pages[0].getEnd()) && - (m_end >= m_pages[0].getStart()) && - (m_end <= m_pages[0].getEnd())); - - const uint64_t offset{m_start - m_pages[0].getStart()}; - auto data_first{std::shared_ptr<char[]>{ - m_pages[0].getData(), - static_cast<char *>(m_pages[0].getData().get()) + offset}}; - data.push_back(FormatV2::RecordData{getSize(), data_first}); - } else if (!m_pages[0].isEmpty() && !m_pages[1].isEmpty()) { - // Both pages - assert((m_start >= m_pages[0].getStart()) && - (m_start <= m_pages[0].getEnd()) && - (m_end >= m_pages[1].getStart()) && - (m_end <= m_pages[1].getEnd())); - - size_t size{m_pages[0].getEnd() - m_start}; - const uint64_t offset{m_start - m_pages[0].getStart()}; - auto data_first{std::shared_ptr<char[]>{ - m_pages[0].getData(), - static_cast<char *>(m_pages[0].getData().get()) + offset}}; - data.push_back(FormatV2::RecordData{size, data_first}); - - size = m_end - m_pages[1].getStart(); - data.push_back(FormatV2::RecordData{size, m_pages[1].getData()}); - } - - return data; - }; - - private: - std::array<Page, 2> m_pages; - uint64_t m_start; - uint64_t m_end; - - bool hasPage(size_t i) // cppcheck-suppress unusedPrivateFunction - { - return (i < m_pages.size()) && (m_pages[i].getData() != nullptr); - }; -}; - -MergeState -diffsTryMerge(Diff &diff_a, Diff &diff_b, size_t max_merge_gap, size_t max_size) -{ - if (diff_a.isEmpty()) { - // Do not merge to an empty diff - return MergeState::Finished; - } - - if (diff_b.isEmpty()) { - // Nothing to merge from an empty diff - return MergeState::Finished; - } - - assert(diff_a.getEnd() <= diff_b.getStart()); - const size_t gap{diff_b.getStart() - diff_a.getEnd()}; - if (gap > max_merge_gap) { - // B is too far away - return MergeState::Finished; - } - - if ((diff_a.getSize() + gap) >= max_size) { - // No space in A - return MergeState::Finished; - } - - // Can be merged - - // Adjust the diff start and end offsets - - // There is always at least 1 byte free in A here - const size_t free{max_size - (diff_a.getSize() + gap)}; - const size_t to_merge{std::min(free, diff_b.getSize())}; - // There is always at least 1 byte to merge from B here - - // Enlarge A - diff_a.m_end += gap + to_merge; - // Shrink B - diff_b.m_start += to_merge; - - // Add B's page to A if needed - - // Non-empty A must have only the first, or both pages - assert(diff_a.hasPage(0)); - // Non-empty B must have only the first page - assert(diff_b.hasPage(0) && !diff_b.hasPage(1)); - - // If A has both pages, B's page must only be the same as A's second - // page. No setting of pages in A is needed in this case - assert(!diff_a.hasPage(1) || (diff_b.m_pages[0] == diff_a.m_pages[1])); - if (!diff_a.hasPage(1)) { - // If A has only the first page, B's page must only be the same as the - // A's first page or following it - const bool b_follows{ - (diff_b.m_pages[0].getData() != diff_a.m_pages[0].getData()) && - (diff_b.m_pages[0].getStart() == diff_a.m_pages[0].getEnd())}; - assert((diff_b.m_pages[0] == diff_a.m_pages[0]) || b_follows); - if (b_follows) { - diff_a.m_pages[1] = diff_b.m_pages[0]; - } - } - - return (diff_a.getSize() >= max_size) ? MergeState::Finished - : MergeState::Incomplete; -} - -class DiffFinder -{ - public: - DiffFinder(std::istream &old_stream, std::istream &new_stream, - uint32_t buffer_size, size_t max_merge_gap) - : m_old_page_reader(old_stream, buffer_size), - m_new_page_reader(new_stream, buffer_size), - m_diff_max_size(buffer_size), m_max_merge_gap(max_merge_gap), - m_offset_in_stream(0), m_diff(0), - m_search_state(SearchState::ReadPages){}; - - Diff findNextDiff() - { - for (;;) { - if (m_search_state == SearchState::ReadPages) { - m_old_page = m_old_page_reader.getNextPage(); - m_new_page = m_new_page_reader.getNextPage(); - assert(m_old_page.getStart() == m_new_page.getStart()); - - if (m_old_page.getSize() != m_new_page.getSize()) { - throw BackupError( - "cannot read the same amount of data from both files"); - } - - const bool end_of_stream{m_old_page.isEmpty() && - m_new_page.isEmpty()}; - if (end_of_stream) { - const Diff return_diff{m_diff}; - m_diff = Diff{m_offset_in_stream}; - return return_diff; - } - - m_search_state = SearchState::FindDiff; - - } else if (m_search_state == SearchState::FindDiff) { - Diff diff{findDiffInPages(m_old_page, m_new_page, - m_offset_in_stream)}; - m_offset_in_stream = diff.getEnd(); - - if (diff.isEmpty()) { - // End of pages. On the next call, read new pages. - m_old_page = Page{}; - m_new_page = Page{}; - m_search_state = SearchState::ReadPages; - } - - const MergeState merge_state{diffsTryMerge( - m_diff, diff, m_max_merge_gap, m_diff_max_size)}; - - if (merge_state == MergeState::Finished) { - const Diff return_diff{m_diff}; - m_diff = diff; - if (!return_diff.isEmpty()) { - return return_diff; - } - } - - } else { - assert(false); - } - } - }; - - private: - enum class SearchState { ReadPages, FindDiff }; - - PagedStreamReader m_old_page_reader; - PagedStreamReader m_new_page_reader; - const size_t m_diff_max_size; - const size_t m_max_merge_gap; - Page m_old_page; - Page m_new_page; - uint64_t m_offset_in_stream; - Diff m_diff; - SearchState m_search_state; - - Diff findDiffInPages(Page old_page, Page new_page, - uint64_t offset_in_stream) - { - const char *old_data{old_page.getData().get()}; - const char *new_data{new_page.getData().get()}; - const uint64_t data_size_bytes{old_page.getSize()}; - - assert(offset_in_stream >= new_page.getStart()); - size_t offset_in_pages{offset_in_stream - new_page.getStart()}; - - // Find offset of the first different byte - for (; offset_in_pages < data_size_bytes; ++offset_in_pages) { - if (new_data[offset_in_pages] != old_data[offset_in_pages]) { - break; - } - } - const size_t start_in_pages{offset_in_pages}; - - if (offset_in_pages < data_size_bytes) { - // Different byte found. Start searching for a same byte immediately - // after. - ++offset_in_pages; - } - - // Find offset of the first same byte - for (; offset_in_pages < data_size_bytes; ++offset_in_pages) { - if (new_data[offset_in_pages] == old_data[offset_in_pages]) { - break; - } - } - const size_t end_in_pages{offset_in_pages}; - - // In the case when no different byte is found, the end offset will be - // the same as the start offset - - const uint64_t start_in_stream{new_page.getStart() + start_in_pages}; - const uint64_t end_in_stream{new_page.getStart() + end_in_pages}; - if (start_in_stream == end_in_stream) { - return Diff{start_in_stream}; - } else { - return Diff{new_page, start_in_stream, end_in_stream}; - } - } -}; - -void -backup(const OptionsBackup &opts) -{ - std::ifstream in_istream{opts.getInFilePath(), - std::ifstream::in | std::ifstream::binary}; - if (!in_istream) { - throw BufferedStream::Error("cannot open input file"); - } - - std::ifstream base_istream{opts.getBaseFilePath(), - std::ifstream::in | std::ifstream::binary}; - if (!base_istream) { - throw BufferedStream::Error("cannot open base file"); - } - - // When backing up, the output file is truncated to hold the new data - std::ofstream out_ostream{opts.getOutFilePath(), std::ofstream::out | - std::ofstream::trunc | - std::ofstream::binary}; - if (!out_ostream) { - throw BufferedStream::Error("cannot open output file"); - } - - DiffFinder diff_finder(base_istream, in_istream, opts.getBufferSize(), - FormatV2::RecordHeaderSize); - FormatV2::Writer diff_writer(out_ostream, opts.getBufferSize()); - - for (;;) { - const Diff diff{diff_finder.findNextDiff()}; - if (diff.isEmpty()) { - break; - } - - diff_writer.writeDiffRecord(diff.getStart(), diff.getSize(), - diff.getData()); - - // Here, the diff is destructed and page data reference counters - // decremented - } -} |
