BareGit

Implement Block Parser with support for Paragraphs, Headings, and BlockQuotes

Author: MetroWind <chris.corsair@gmail.com>
Date: Sat Jan 10 12:43:21 2026 -0800
Commit: 3e610d2cc93c094966fee561c174335a99abb0c6

Changes

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0194da..2f4e86f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ include_directories(include)
 # but for now we will just list them manually or add a library target.
 # Let's create a core library for the logic to share between main and tests.
 
-add_library(macrodown_lib STATIC src/lib_placeholder.cpp src/macro_engine.cpp src/parser.cpp)
+add_library(macrodown_lib STATIC src/lib_placeholder.cpp src/macro_engine.cpp src/parser.cpp src/block_parser.cpp)
 target_include_directories(macrodown_lib PUBLIC include)
 target_link_libraries(macrodown_lib PUBLIC uni-algo::uni-algo)
 
@@ -47,7 +47,7 @@ target_link_libraries(macrodown PRIVATE macrodown_lib)
 # Testing
 enable_testing()
 
-add_executable(macrodown_test tests/test_main.cpp tests/test_macro_engine.cpp)
+add_executable(macrodown_test tests/test_main.cpp tests/test_macro_engine.cpp tests/test_block_parser.cpp)
 target_link_libraries(macrodown_test PRIVATE macrodown_lib GTest::gtest_main)
 target_include_directories(macrodown_test PRIVATE include)
 
diff --git a/include/block_parser.h b/include/block_parser.h
new file mode 100644
index 0000000..4a59447
--- /dev/null
+++ b/include/block_parser.h
@@ -0,0 +1,37 @@
+#ifndef MACRODOWN_BLOCK_PARSER_H
+#define MACRODOWN_BLOCK_PARSER_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include "block.h"
+
+namespace macrodown {
+
+class BlockParser {
+public:
+    // Main entry point: parses the entire document (split on '\n') into a Block tree rooted at a Document block
+    static std::unique_ptr<Block> parse(const std::string& input);
+
+private:
+    struct BlockStackItem {
+        Block* block; // non-owning; the Block is owned by `root` or by its parent's `children`
+        // Parsing state specific to this block type could go here
+    };
+
+    std::unique_ptr<Block> root; // Document block; moved out to the caller when parse() returns
+    std::vector<BlockStackItem> open_blocks; // stack of currently open blocks; index 0 is `root`, back() is the innermost
+
+    BlockParser(); // private: instances are only created inside parse()
+    void process_line(const std::string& line); // feeds one line (terminator already removed) into the tree
+    void close_unmatched_blocks(size_t last_matched_index); // pops open_blocks down to last_matched_index + 1 entries, clearing each popped block's `open` flag
+    void add_text_to_current(const std::string& text); // NOTE(review): no definition is visible in block_parser.cpp in this commit - confirm before calling
+    
+    // Checkers
+    bool is_container(BlockType type); // true for Document, Quote, List and ListItem
+    bool matches(Block* block, const std::string& line, size_t& offset); // does the open `block` continue on `line`? a matching Quote advances `offset` past its '>' marker
+};
+
+} // namespace macrodown
+
+#endif // MACRODOWN_BLOCK_PARSER_H
diff --git a/src/block_parser.cpp b/src/block_parser.cpp
new file mode 100644
index 0000000..107853a
--- /dev/null
+++ b/src/block_parser.cpp
@@ -0,0 +1,224 @@
+#include "block_parser.h"
+#include <sstream>
+#include <iostream>
+#include <algorithm>
+
+namespace macrodown {
+
+namespace {
+
+// Helper: true when `line` is empty or holds only ' ', '\t', '\n', '\v', '\f', '\r' (locale-independent; needs no <cctype>)
+bool is_blank(const std::string& line) {
+    return line.find_first_not_of(" \t\n\v\f\r") == std::string::npos;
+}
+
+// Helper: length of the run of ' ' characters that begins at `offset`
+size_t count_indent(const std::string& line, size_t offset) {
+    if (offset >= line.size()) {
+        return 0; // nothing left to scan
+    }
+    const size_t first_other = line.find_first_not_of(' ', offset);
+    return (first_other == std::string::npos ? line.size() : first_other) - offset;
+}
+
+} // namespace
+
+BlockParser::BlockParser() : root(std::make_unique<Block>(BlockType::Document)) {
+    // The Document root is the first entry (index 0) of the open-block stack.
+    open_blocks.push_back(BlockStackItem{root.get()});
+}
+
+// Parses `input` line by line and returns the resulting Document block tree.
+std::unique_ptr<Block> BlockParser::parse(const std::string& input) {
+    BlockParser state;
+    std::istringstream lines(input);
+    for (std::string current; std::getline(lines, current);) {
+        // Drop the '\r' that a Windows "\r\n" line ending leaves behind.
+        const bool has_cr = !current.empty() && current.back() == '\r';
+        if (has_cr) {
+            current.pop_back();
+        }
+        state.process_line(current);
+    }
+
+    // End of input: pop every block above the root (marking each one closed),
+    // then mark the root itself closed; the root is never popped from the stack.
+    state.close_unmatched_blocks(0);
+    state.root->open = false;
+
+    return std::move(state.root);
+}
+
+// True for Document, Quote, List and ListItem; false for every other type.
+bool BlockParser::is_container(BlockType type) {
+    const BlockType containers[] = {BlockType::Document, BlockType::Quote,
+                                    BlockType::List, BlockType::ListItem};
+    return std::find(std::begin(containers), std::end(containers), type) != std::end(containers);
+}
+
+// Decides whether the already-open `block` continues on `line`.
+// `offset` is the scan position in `line`; a matching Quote advances it past its '>' marker.
+bool BlockParser::matches(Block* block, const std::string& line, size_t& offset) {
+    const auto kind = block->type;
+    const size_t length = line.size();
+    // The Document root continues on every line.
+    if (kind == BlockType::Document) {
+        return true;
+    }
+
+    // Only Quote and Paragraph have continuation rules; any other type never matches.
+    if (kind != BlockType::Quote && kind != BlockType::Paragraph) {
+        return false;
+    }
+
+    // A wholly blank line never continues a Paragraph.
+    if (kind == BlockType::Paragraph && is_blank(line)) {
+        return false;
+    }
+
+    // Position of the first character after the run of spaces at `offset`.
+    const size_t spaces = count_indent(line, offset);
+    const size_t marker = offset + spaces;
+    const bool shallow = spaces < 4; // markers are only recognised below 4 spaces of indent
+    const bool at_quote_marker = shallow && marker < length && line[marker] == '>';
+
+    if (kind == BlockType::Quote) {
+        if (!at_quote_marker) {
+            return false;
+        }
+        offset = marker + 1; // consume the indent and the '>'
+        if (offset < length && line[offset] == ' ') {
+            ++offset; // consume the single optional space after '>'
+        }
+        return true;
+    }
+
+    // Paragraph: a '>' marker or an ATX heading opener interrupts it.
+    if (at_quote_marker) {
+        return false;
+    }
+    size_t after_hashes = marker;
+    while (shallow && after_hashes < length && line[after_hashes] == '#' && after_hashes - marker < 6) {
+        ++after_hashes;
+    }
+    const bool heading = after_hashes > marker && (after_hashes == length || line[after_hashes] == ' ');
+    return !heading;
+}
+
+// Consumes one input line and updates the block tree and the open-block stack.
+// `line` must already have its terminator removed (parse() does this).
+//
+// Steps: (1) walk the open-block stack to see which blocks continue on this line,
+// (2) close the ones that do not, (3) open a Quote for every further '>' marker,
+// (4) emit a Heading if the rest of the line is an ATX heading, otherwise
+// (5) start a new Paragraph or append to the open one.
+//
+// Blankness is judged on the text that remains after the container markers, so a
+// bare ">" line ends the paragraph inside the quote instead of being appended to
+// it, and never starts an empty paragraph.
+//
+// Lazy continuation is not implemented: a line that lacks the '>' marker closes
+// an open Quote together with everything inside it.
+void BlockParser::process_line(const std::string& line) {
+    size_t offset = 0;
+
+    // 1. Find matches in open blocks. Index 0 is the Document root, which always
+    //    matches, so the walk starts at 1 and stops at the first block that fails.
+    size_t matches_count = 0;
+    for (size_t i = 1; i < open_blocks.size(); ++i) {
+        if (!matches(open_blocks[i].block, line, offset)) {
+            break;
+        }
+        matches_count = i;
+    }
+
+    // 2. Close unmatched blocks: everything above the deepest matching block is
+    //    popped from the stack and marked closed.
+    close_unmatched_blocks(matches_count);
+
+    // 3. Open new blocks: one Quote per '>' marker found at the scan position.
+    //    Each new Quote becomes a child of the current tip and then the new tip.
+    while (true) {
+        const size_t indent = count_indent(line, offset);
+        const size_t marker = offset + indent;
+        if (indent >= 4 || marker >= line.size() || line[marker] != '>') {
+            break;
+        }
+        offset = marker + 1;
+        if (offset < line.size() && line[offset] == ' ') offset++; // optional space after '>'
+
+        auto new_block = std::make_unique<Block>(BlockType::Quote);
+        Block* ptr = new_block.get();
+        open_blocks.back().block->children.push_back(std::move(new_block));
+        open_blocks.push_back({ptr});
+    }
+
+    Block* tip = open_blocks.back().block;
+
+    // What is left once container markers are consumed decides blankness.
+    // `offset` never exceeds line.size(): it only advances past characters that exist.
+    const bool rest_is_blank = is_blank(line.substr(offset));
+
+    // 4a. A blank remainder closes an open Paragraph and is otherwise ignored.
+    if (rest_is_blank) {
+        if (tip->type == BlockType::Paragraph) {
+            close_unmatched_blocks(open_blocks.size() - 2); // pop just the paragraph
+        }
+        return;
+    }
+
+    // 4b. ATX heading: 1-6 '#' characters followed by a space or the end of the line.
+    const size_t indent = count_indent(line, offset);
+    if (indent < 4) {
+        const size_t check_pos = offset + indent;
+        size_t hash_count = 0;
+        while (check_pos + hash_count < line.size() && line[check_pos + hash_count] == '#' && hash_count < 6) {
+            hash_count++;
+        }
+
+        if (hash_count > 0 && (check_pos + hash_count == line.size() || line[check_pos + hash_count] == ' ')) {
+            // A heading is never a child of a paragraph: close it and attach to its parent.
+            if (tip->type == BlockType::Paragraph) {
+                close_unmatched_blocks(open_blocks.size() - 2);
+                tip = open_blocks.back().block;
+            }
+
+            auto heading = std::make_unique<Block>(BlockType::Heading);
+            heading->level = hash_count;
+            // Content is the rest of the line with the leading spaces skipped.
+            // Trailing spaces and closing '#' runs are kept as-is.
+            size_t content_start = check_pos + hash_count;
+            while (content_start < line.size() && line[content_start] == ' ') content_start++;
+            heading->literal_content = line.substr(content_start);
+            heading->open = false; // Headings are single line
+
+            tip->children.push_back(std::move(heading));
+            return;
+        }
+    }
+
+    // 5. Plain text: leading spaces are stripped before the text is stored.
+    //    A tip that is neither a Paragraph nor a container receives no text.
+    const std::string text = line.substr(offset + indent);
+    if (tip->type == BlockType::Paragraph) {
+        // Continuation of the open paragraph
+        tip->literal_content += "\n" + text;
+    } else if (is_container(tip->type)) {
+        // Start a new Paragraph under the container and keep it open
+        auto p = std::make_unique<Block>(BlockType::Paragraph);
+        Block* p_ptr = p.get();
+        p_ptr->literal_content = text;
+        tip->children.push_back(std::move(p));
+        open_blocks.push_back({p_ptr});
+    }
+}
+
+// Pops (and marks closed) every open block above stack index `last_matched_index`.
+void BlockParser::close_unmatched_blocks(size_t last_matched_index) {
+    for (size_t depth = open_blocks.size(); depth > last_matched_index + 1; --depth) {
+        open_blocks.back().block->open = false;
+        open_blocks.pop_back();
+    }
+}
+
+} // namespace macrodown
diff --git a/tests/test_block_parser.cpp b/tests/test_block_parser.cpp
new file mode 100644
index 0000000..8d1f6a3
--- /dev/null
+++ b/tests/test_block_parser.cpp
@@ -0,0 +1,97 @@
+#include <gtest/gtest.h>
+#include "block_parser.h"
+
+using namespace macrodown;
+
+// Two consecutive non-blank lines form one Paragraph whose lines are joined by '\n'.
+TEST(BlockParserTest, SimpleParagraph) {
+    const auto document = BlockParser::parse("Hello\nWorld");
+
+    ASSERT_EQ(document->type, BlockType::Document);
+    ASSERT_EQ(document->children.size(), 1u);
+
+    const Block& paragraph = *document->children[0];
+    EXPECT_EQ(paragraph.type, BlockType::Paragraph);
+    EXPECT_EQ(paragraph.literal_content, "Hello\nWorld");
+}
+
+// A blank line between two text lines yields two sibling Paragraphs.
+TEST(BlockParserTest, MultipleParagraphs) {
+    const auto document = BlockParser::parse("Para 1\n\nPara 2");
+    ASSERT_EQ(document->children.size(), 2u);
+    for (const auto& child : document->children) {
+        EXPECT_EQ(child->type, BlockType::Paragraph);
+    }
+}
+
+// Each ATX line becomes a Heading whose level is its '#' count and whose text excludes the marker.
+TEST(BlockParserTest, Headers) {
+    const auto document = BlockParser::parse("# H1\n## H2");
+
+    ASSERT_EQ(document->children.size(), 2u);
+
+    const Block& first = *document->children[0];
+    EXPECT_EQ(first.type, BlockType::Heading);
+    EXPECT_EQ(first.level, 1);
+    EXPECT_EQ(first.literal_content, "H1");
+
+    const Block& second = *document->children[1];
+    EXPECT_EQ(second.type, BlockType::Heading);
+    EXPECT_EQ(second.level, 2);
+    EXPECT_EQ(second.literal_content, "H2");
+}
+
+// Consecutive '>' lines share one Quote holding a single two-line Paragraph.
+TEST(BlockParserTest, BlockQuote) {
+    const auto document = BlockParser::parse("> Hello\n> World");
+
+    ASSERT_EQ(document->children.size(), 1u);
+    const Block& quote = *document->children[0];
+    EXPECT_EQ(quote.type, BlockType::Quote);
+
+    ASSERT_EQ(quote.children.size(), 1u);
+    const Block& paragraph = *quote.children[0];
+    EXPECT_EQ(paragraph.type, BlockType::Paragraph);
+    EXPECT_EQ(paragraph.literal_content, "Hello\nWorld");
+}
+
+// A ">>" line after a "> text" line nests a second Quote inside the first one,
+// as a sibling of the paragraph that the first line produced.
+TEST(BlockParserTest, NestedQuote) {
+    const auto document = BlockParser::parse("> Level 1\n>> Level 2");
+
+    ASSERT_EQ(document->children.size(), 1u);
+    const Block& outer = *document->children[0];
+    EXPECT_EQ(outer.type, BlockType::Quote);
+
+    // Expected tree:
+    //   Quote
+    //     Paragraph("Level 1")
+    //     Quote
+    //       Paragraph("Level 2")
+    //
+    // How the parser gets there:
+    //   Line 1, "> Level 1":
+    //     opens the outer Quote and starts Paragraph("Level 1") inside it.
+    //   Line 2, ">> Level 2":
+    //     - the first '>' matches the outer Quote;
+    //     - the open Paragraph is asked whether it continues, but the remaining
+    //       text starts with '>', so it does not match and is closed (it stays
+    //       in the outer Quote's child list);
+    //     - the second '>' opens the inner Quote as a new child of the outer
+    //       Quote;
+    //     - "Level 2" starts a fresh Paragraph inside the inner Quote.
+
+    ASSERT_EQ(outer.children.size(), 2u); // Paragraph("Level 1") + inner Quote
+
+    const Block& first_paragraph = *outer.children[0];
+    EXPECT_EQ(first_paragraph.type, BlockType::Paragraph);
+    EXPECT_EQ(first_paragraph.literal_content, "Level 1");
+
+    const Block& inner = *outer.children[1];
+    EXPECT_EQ(inner.type, BlockType::Quote);
+
+    // The inner Quote holds only the second line's text.
+    ASSERT_EQ(inner.children.size(), 1u);
+    EXPECT_EQ(inner.children[0]->literal_content, "Level 2");
+}