Changes
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0194da..2f4e86f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ include_directories(include)
# but for now we will just list them manually or add a library target.
# Let's create a core library for the logic to share between main and tests.
-add_library(macrodown_lib STATIC src/lib_placeholder.cpp src/macro_engine.cpp src/parser.cpp)
+add_library(macrodown_lib STATIC src/lib_placeholder.cpp src/macro_engine.cpp src/parser.cpp src/block_parser.cpp)
target_include_directories(macrodown_lib PUBLIC include)
target_link_libraries(macrodown_lib PUBLIC uni-algo::uni-algo)
@@ -47,7 +47,7 @@ target_link_libraries(macrodown PRIVATE macrodown_lib)
# Testing
enable_testing()
-add_executable(macrodown_test tests/test_main.cpp tests/test_macro_engine.cpp)
+add_executable(macrodown_test tests/test_main.cpp tests/test_macro_engine.cpp tests/test_block_parser.cpp)
target_link_libraries(macrodown_test PRIVATE macrodown_lib GTest::gtest_main)
target_include_directories(macrodown_test PRIVATE include)
diff --git a/include/block_parser.h b/include/block_parser.h
new file mode 100644
index 0000000..4a59447
--- /dev/null
+++ b/include/block_parser.h
@@ -0,0 +1,37 @@
+#ifndef MACRODOWN_BLOCK_PARSER_H
+#define MACRODOWN_BLOCK_PARSER_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include "block.h"
+
+namespace macrodown {
+
+/// Line-oriented block parser: turns a whole document into a tree of Block nodes.
+class BlockParser {
+public:
+    /// Parses all of `input` and returns the root block of the resulting tree.
+    static std::unique_ptr<Block> parse(const std::string& input);
+
+private:
+    /// One entry of the stack of currently open blocks.
+    struct BlockStackItem {
+        Block* block;  // Non-owning pointer to a block that lives in the tree under `root`.
+        // Per-block parsing state could be added here later.
+    };
+
+    /// Instances are only created internally by parse().
+    BlockParser();
+
+    // Line processing
+    void process_line(const std::string& line);
+    void close_unmatched_blocks(size_t last_matched_index);
+    // NOTE(review): declared only; no definition appears in src/block_parser.cpp as added here.
+    void add_text_to_current(const std::string& text);
+
+    // Checkers
+    bool is_container(BlockType type);
+    bool matches(Block* block, const std::string& line, size_t& offset);
+
+    // `root` owns the tree; `open_blocks` holds the path of still-open blocks, root first.
+    std::unique_ptr<Block> root;
+    std::vector<BlockStackItem> open_blocks;
+};
+
+} // namespace macrodown
+
+#endif // MACRODOWN_BLOCK_PARSER_H
diff --git a/src/block_parser.cpp b/src/block_parser.cpp
new file mode 100644
index 0000000..107853a
--- /dev/null
+++ b/src/block_parser.cpp
@@ -0,0 +1,224 @@
+#include "block_parser.h"
+#include <sstream>
+#include <iostream>
+#include <algorithm>
+
+namespace macrodown {
+
+namespace {
+
+// Helper: true when `line` is empty or contains only whitespace
+// (space, \t, \n, \v, \f, \r).
+// An explicit character set is used instead of std::isspace so the result does not
+// depend on the global C locale and no <cctype> include is required.
+bool is_blank(const std::string& line) {
+    return line.find_first_not_of(" \t\n\v\f\r") == std::string::npos;
+}
+
+// Helper: number of consecutive ' ' characters in `line` starting at `offset`.
+// Only spaces are counted (tabs stop the run); an offset at or past the end yields 0.
+size_t count_indent(const std::string& line, size_t offset) {
+    size_t pos = offset;
+    for (; pos < line.size(); ++pos) {
+        if (line[pos] != ' ') {
+            break;
+        }
+    }
+    return pos - offset;
+}
+
+} // namespace
+
+// Creates the Document root and makes it the only entry of the open-block stack.
+BlockParser::BlockParser()
+    : root(std::make_unique<Block>(BlockType::Document)) {
+    open_blocks.push_back(BlockStackItem{root.get()});
+}
+
+// Parses `input` line by line and returns the root (Document) block of the tree.
+// Lines are split on '\n'; a trailing '\r' is dropped so CRLF input is handled too.
+std::unique_ptr<Block> BlockParser::parse(const std::string& input) {
+    BlockParser parser;
+    std::istringstream lines(input);
+
+    for (std::string current; std::getline(lines, current);) {
+        if (!current.empty() && current.back() == '\r') {
+            current.pop_back();
+        }
+        parser.process_line(current);
+    }
+
+    // End of input: pop (and mark closed) every block above the root. The root never
+    // leaves the stack, so it is flagged as closed explicitly.
+    parser.close_unmatched_blocks(0);
+    parser.root->open = false;
+
+    // `root` is a member of `parser`, not a local, so it has to be moved out explicitly.
+    return std::move(parser.root);
+}
+
+// Reports whether `type` is one of the block kinds treated as containers:
+// Document, Quote, List or ListItem.
+bool BlockParser::is_container(BlockType type) {
+    switch (type) {
+        case BlockType::Document:
+        case BlockType::Quote:
+        case BlockType::List:
+        case BlockType::ListItem:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Decides whether the already-open `block` continues on `line`.
+//
+// `offset` is the position in `line` from which this block looks for its marker.
+// A matching Quote advances it past its '>' and one optional following space;
+// no other case modifies it.
+//
+// Returns true when the block stays open for this line, false when it must be closed.
+bool BlockParser::matches(Block* block, const std::string& line, size_t& offset) {
+    switch (block->type) {
+        case BlockType::Document:
+            return true;  // The document matches every line
+
+        case BlockType::Quote: {
+            // Needs '>' after fewer than 4 spaces of indentation.
+            const size_t spaces = count_indent(line, offset);
+            const size_t marker_pos = offset + spaces;
+            if (spaces >= 4 || marker_pos >= line.size() || line[marker_pos] != '>') {
+                return false;
+            }
+            offset = marker_pos + 1;
+            if (offset < line.size() && line[offset] == ' ') {
+                ++offset;  // Optional space after the marker
+            }
+            return true;
+        }
+
+        case BlockType::Paragraph: {
+            if (is_blank(line)) {
+                return false;
+            }
+
+            const size_t spaces = count_indent(line, offset);
+            if (spaces >= 4) {
+                return true;  // No interruption check is made at this indentation
+            }
+
+            const size_t start = offset + spaces;
+            if (start >= line.size()) {
+                return true;
+            }
+
+            // A block quote marker interrupts the paragraph.
+            if (line[start] == '>') {
+                return false;
+            }
+
+            // So does an ATX heading: 1-6 '#' followed by a space or the end of the line.
+            if (line[start] == '#') {
+                size_t run_end = start;
+                while (run_end < line.size() && line[run_end] == '#' && run_end - start < 6) {
+                    ++run_end;
+                }
+                if (run_end == line.size() || line[run_end] == ' ') {
+                    return false;
+                }
+            }
+
+            return true;  // Plain continuation line
+        }
+
+        default:
+            return false;
+    }
+}
+
+// Feeds one input line (line terminator already removed) through the open-block stack.
+//
+// Steps:
+//   1. Let each open block above the root try to continue on this line; stop at the
+//      first one that does not match.
+//   2. Close every block beyond the last match.
+//   3. Open a new Quote for each further '>' marker found at the current offset.
+//   4. If only whitespace is left after the container markers, the line is blank for
+//      the innermost block: it ends an open Paragraph and otherwise adds nothing.
+//      (Testing the remainder rather than the whole line is what makes a bare ">"
+//      inside a quote behave as a blank line.)
+//   5. Emit an ATX Heading if the remainder is one.
+//   6. Otherwise start a new Paragraph or append to the open one.
+void BlockParser::process_line(const std::string& line) {
+    size_t offset = 0;
+
+    // 1. Find matches in open blocks (index 0 is the root, which always matches)
+    size_t matches_count = 0;
+    for (size_t i = 1; i < open_blocks.size(); ++i) {
+        if (!matches(open_blocks[i].block, line, offset)) {
+            break;
+        }
+        matches_count = i;
+    }
+
+    // 2. Close unmatched blocks
+    close_unmatched_blocks(matches_count);
+
+    // 3. Open new block quotes: '>' after fewer than 4 spaces, plus one optional space
+    while (true) {
+        const size_t quote_indent = count_indent(line, offset);
+        const size_t marker_pos = offset + quote_indent;
+        if (quote_indent >= 4 || marker_pos >= line.size() || line[marker_pos] != '>') {
+            break;
+        }
+        offset = marker_pos + 1;
+        if (offset < line.size() && line[offset] == ' ') offset++;
+
+        auto new_block = std::make_unique<Block>(BlockType::Quote);
+        Block* ptr = new_block.get();
+        open_blocks.back().block->children.push_back(std::move(new_block));
+        open_blocks.push_back({ptr});
+    }
+
+    Block* tip = open_blocks.back().block;
+
+    // 4. Blank remainder: ends an open paragraph, never creates content.
+    //    `offset` never exceeds line.size(), so substr() cannot throw here.
+    if (is_blank(line.substr(offset))) {
+        if (tip->type == BlockType::Paragraph) {
+            close_unmatched_blocks(open_blocks.size() - 2); // Close paragraph
+        }
+        return;
+    }
+
+    // 5. ATX heading: 1-6 '#' after fewer than 4 spaces, followed by a space or end of line
+    const size_t indent = count_indent(line, offset);
+    if (indent < 4) {
+        const size_t check_pos = offset + indent;
+        size_t hash_count = 0;
+        while (check_pos + hash_count < line.size() && line[check_pos + hash_count] == '#' && hash_count < 6) {
+            hash_count++;
+        }
+
+        if (hash_count > 0 && (check_pos + hash_count == line.size() || line[check_pos + hash_count] == ' ')) {
+            // A heading is never nested in a paragraph. matches() already refuses to
+            // continue a paragraph on a heading line, so this is a defensive guard.
+            if (tip->type == BlockType::Paragraph) {
+                close_unmatched_blocks(open_blocks.size() - 2);
+                tip = open_blocks.back().block;
+            }
+
+            auto heading = std::make_unique<Block>(BlockType::Heading);
+            heading->level = hash_count;
+
+            // Content is the rest of the line with leading and trailing spaces stripped.
+            // TODO: CommonMark also removes an optional closing sequence of '#'.
+            size_t content_start = check_pos + hash_count;
+            while (content_start < line.size() && line[content_start] == ' ') content_start++;
+            size_t content_end = line.size();
+            while (content_end > content_start && line[content_end - 1] == ' ') content_end--;
+            heading->literal_content = line.substr(content_start, content_end - content_start);
+            heading->open = false; // Headings are single line
+
+            tip->children.push_back(std::move(heading));
+            return;
+        }
+    }
+
+    // 6. Paragraph text; leading spaces of the line are stripped
+    const size_t text_start = offset + indent;
+    if (is_container(tip->type)) {
+        // Start a new paragraph inside the container and leave it open
+        auto p = std::make_unique<Block>(BlockType::Paragraph);
+        Block* p_ptr = p.get();
+        tip->children.push_back(std::move(p));
+        open_blocks.push_back({p_ptr});
+        p_ptr->literal_content = line.substr(text_start);
+    } else if (tip->type == BlockType::Paragraph) {
+        // Continuation line: joined to the previous text with '\n'
+        tip->literal_content += "\n" + line.substr(text_start);
+    }
+}
+
+// Pops open blocks until only stack indices 0..last_matched_index remain,
+// marking each popped block as no longer open.
+void BlockParser::close_unmatched_blocks(size_t last_matched_index) {
+    const size_t keep = last_matched_index + 1;
+    for (; open_blocks.size() > keep; open_blocks.pop_back()) {
+        open_blocks.back().block->open = false;
+    }
+}
+
+} // namespace macrodown
diff --git a/tests/test_block_parser.cpp b/tests/test_block_parser.cpp
new file mode 100644
index 0000000..8d1f6a3
--- /dev/null
+++ b/tests/test_block_parser.cpp
@@ -0,0 +1,97 @@
+#include <gtest/gtest.h>
+#include "block_parser.h"
+
+using namespace macrodown;
+
+// Two consecutive non-blank lines form one paragraph whose lines are joined by '\n'.
+TEST(BlockParserTest, SimpleParagraph) {
+    const auto doc = BlockParser::parse("Hello\nWorld");
+
+    ASSERT_EQ(doc->type, BlockType::Document);
+    ASSERT_EQ(doc->children.size(), 1);
+
+    const auto& paragraph = *doc->children[0];
+    EXPECT_EQ(paragraph.type, BlockType::Paragraph);
+    EXPECT_EQ(paragraph.literal_content, "Hello\nWorld");
+}
+
+// A blank line between two text lines separates them into two paragraphs.
+TEST(BlockParserTest, MultipleParagraphs) {
+    const auto doc = BlockParser::parse("Para 1\n\nPara 2");
+
+    ASSERT_EQ(doc->children.size(), 2);
+    for (size_t i = 0; i < 2; ++i) {
+        EXPECT_EQ(doc->children[i]->type, BlockType::Paragraph);
+    }
+}
+
+// ATX headings become sibling Heading blocks carrying their level and text.
+TEST(BlockParserTest, Headers) {
+    const auto doc = BlockParser::parse("# H1\n## H2");
+
+    ASSERT_EQ(doc->children.size(), 2);
+
+    const auto& first = *doc->children[0];
+    EXPECT_EQ(first.type, BlockType::Heading);
+    EXPECT_EQ(first.level, 1);
+    EXPECT_EQ(first.literal_content, "H1");
+
+    const auto& second = *doc->children[1];
+    EXPECT_EQ(second.type, BlockType::Heading);
+    EXPECT_EQ(second.level, 2);
+    EXPECT_EQ(second.literal_content, "H2");
+}
+
+// Consecutive '>' lines form one Quote holding a single two-line paragraph.
+TEST(BlockParserTest, BlockQuote) {
+    const auto doc = BlockParser::parse("> Hello\n> World");
+
+    ASSERT_EQ(doc->children.size(), 1);
+    const auto& quote = *doc->children[0];
+    EXPECT_EQ(quote.type, BlockType::Quote);
+
+    ASSERT_EQ(quote.children.size(), 1);
+    const auto& paragraph = *quote.children[0];
+    EXPECT_EQ(paragraph.type, BlockType::Paragraph);
+    EXPECT_EQ(paragraph.literal_content, "Hello\nWorld");
+}
+
+// A ">>" line after a "> text" line closes the outer quote's paragraph and opens a
+// nested quote as its sibling inside the outer quote.
+TEST(BlockParserTest, NestedQuote) {
+ std::string input = "> Level 1\n>> Level 2";
+ auto root = BlockParser::parse(input);
+
+ ASSERT_EQ(root->children.size(), 1);
+ auto* q1 = root->children[0].get();
+ EXPECT_EQ(q1->type, BlockType::Quote);
+
+ // Expected tree: Quote -> [Paragraph("Level 1"), Quote -> [Paragraph("Level 2")]]
+ //
+ // How ">> Level 2" is processed:
+ //  - The outer quote (q1) matches and consumes the first '>'.
+ //  - The paragraph opened by "Level 1" sees another '>' next, does not match,
+ //    and is therefore closed. It stays in q1's children.
+ //  - The second '>' opens a nested quote (q2), appended to q1's children.
+ //  - "Level 2" starts a new paragraph inside q2.
+
+ ASSERT_EQ(q1->children.size(), 2); // P("Level 1") + Quote
+
+ EXPECT_EQ(q1->children[0]->type, BlockType::Paragraph);
+ EXPECT_EQ(q1->children[0]->literal_content, "Level 1");
+
+ EXPECT_EQ(q1->children[1]->type, BlockType::Quote);
+ auto* q2 = q1->children[1].get();
+
+ ASSERT_EQ(q2->children.size(), 1);
+ EXPECT_EQ(q2->children[0]->literal_content, "Level 2");
+}