BareGit

Refactor parser.cpp: Extract InlineParser class and add comments

Author: MetroWind <chris.corsair@gmail.com>
Date: Sat Jan 24 13:05:52 2026 -0800
Commit: 97ca91af8b61b015e6679aada9c8aa1caf29039f

Changes

diff --git a/src/parser.cpp b/src/parser.cpp
index 16e256f..8dcdc08 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -6,191 +6,251 @@
 namespace macrodown
 {
 
-std::vector<std::unique_ptr<Node>> Parser::parse(
-    const std::string& input,
-    const std::vector<PrefixMarkup>& prefix_markups,
-    const std::vector<DelimitedMarkup>& delimited_markups)
+namespace
 {
-    std::u32string input32 = una::utf8to32u(input);
-    std::vector<std::unique_ptr<Node>> nodes;
-    std::u32string current_text;
 
-    auto flush_text = [&]()
+struct PrefixInfo
+{
+    char32_t cp;
+    std::string macro;
+};
+
+struct DelimInfo
+{
+    char32_t cp;
+    std::string macro;
+};
+
+// InlineParser handles the parsing of inline elements within a block of text.
+// It scans the input string character by character (in UTF-32) and dispatches
+// to specific handlers for different markup types.
+class InlineParser
+{
+public:
+    InlineParser(const std::string& input,
+                 const std::vector<PrefixMarkup>& prefix_markups,
+                 const std::vector<DelimitedMarkup>& delimited_markups)
+        : input32_(una::utf8to32u(input)),
+          prefix_markups_(prefix_markups),
+          delimited_markups_(delimited_markups)
     {
-        if(!current_text.empty())
+        // Cache the first code point of each markup's prefix/delimiter; markups whose string is empty are skipped.
+        for (const auto& m : prefix_markups_)
         {
-            nodes.push_back(std::make_unique<Node>(Text{una::utf32to8(current_text)}));
-            current_text.clear();
+            auto cp = una::utf8to32u(m.prefix);
+            if (!cp.empty())
+                p_infos_.push_back({cp[0], m.macro_name});
         }
-    };
-    
-    // Pre-calculate code points for custom markups
-    struct PrefixInfo {
-        char32_t cp;
-        std::string macro;
-    };
-    std::vector<PrefixInfo> p_infos;
-    for(const auto& m : prefix_markups)
+        for (const auto& m : delimited_markups_)
+        {
+            auto cp = una::utf8to32u(m.delimiter);
+            if (!cp.empty())
+                d_infos_.push_back({cp[0], m.macro_name});
+        }
+    }
+
+    // Main parsing loop. Iterates through the input string and attempts to
+    // match markup elements. If a match is found, the handler advances the
+    // position. Otherwise, the current character is treated as plain text.
+    std::vector<std::unique_ptr<Node>> parse()
     {
-        auto cp = una::utf8to32u(m.prefix);
-        if(!cp.empty()) p_infos.push_back({cp[0], m.macro_name});
+        while (pos_ < input32_.length())
+        {
+            if (handle_escape()) continue;
+            if (handle_prefix_markup()) continue;
+            if (handle_delimited_markup()) continue;
+            if (handle_macro()) continue;
+            if (handle_code()) continue;
+            if (handle_link()) continue;
+            if (handle_emphasis()) continue;
+
+            current_text_ += input32_[pos_];
+            pos_++;
+        }
+        flush_text();
+        return std::move(nodes_);
     }
 
-    struct DelimInfo {
-        char32_t cp;
-        std::string macro;
-    };
-    std::vector<DelimInfo> d_infos;
-    for(const auto& m : delimited_markups)
+private:
+    std::u32string input32_;
+    const std::vector<PrefixMarkup>& prefix_markups_;
+    const std::vector<DelimitedMarkup>& delimited_markups_;
+
+    std::vector<PrefixInfo> p_infos_;
+    std::vector<DelimInfo> d_infos_;
+
+    size_t pos_ = 0;
+    std::vector<std::unique_ptr<Node>> nodes_;
+    std::u32string current_text_;
+
+    // Pushes accumulated plain text into the node list as a Text node.
+    void flush_text()
     {
-        auto cp = una::utf8to32u(m.delimiter);
-        if(!cp.empty()) d_infos.push_back({cp[0], m.macro_name});
+        if (!current_text_.empty())
+        {
+            nodes_.push_back(std::make_unique<Node>(Text{una::utf32to8(current_text_)}));
+            current_text_.clear();
+        }
     }
 
-    size_t i = 0;
-    while(i < input32.length())
+    // Handles backslash escapes. Consumes the backslash and appends the
+    // following character literally to the text buffer.
+    bool handle_escape()
     {
-        char32_t c = input32[i];
-        
-        // Escape handling
-        if(c == '\\' && i + 1 < input32.length())
+        if (input32_[pos_] == '\\' && pos_ + 1 < input32_.length())
         {
-            current_text += input32[i+1];
-            i += 2;
-            continue;
+            current_text_ += input32_[pos_ + 1];
+            pos_ += 2;
+            return true;
         }
+        return false;
+    }
 
-        // Custom Prefix Markup
-        bool matched_prefix = false;
-        for(const auto& info : p_infos)
+    // Handles user-defined prefix markups (e.g., #tag).
+    // Matches a specific prefix character and captures text until a boundary
+    // (whitespace or punctuation, excluding underscore).
+    bool handle_prefix_markup()
+    {
+        for (const auto& info : p_infos_)
         {
-            if(c == info.cp)
+            if (input32_[pos_] == info.cp)
             {
-                // Scan until whitespace or punctuation (except _)
-                size_t j = i + 1;
-                while(j < input32.length())
+                size_t j = pos_ + 1;
+                while (j < input32_.length())
                 {
-                    char32_t next_c = input32[j];
-                    if(una::codepoint::is_whitespace(next_c)) break;
-                    // Punctuation check using uni-algo
-                    if(next_c != '_' && una::codepoint::prop{next_c}.General_Category_P()) break;
+                    char32_t next_c = input32_[j];
+                    if (una::codepoint::is_whitespace(next_c)) break;
+                    if (next_c != '_' && una::codepoint::prop{next_c}.General_Category_P()) break;
                     j++;
                 }
-                
-                if(j > i + 1)
+
+                if (j > pos_ + 1)
                 {
                     flush_text();
-                    std::u32string content = input32.substr(i + 1, j - (i + 1));
+                    std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
                     Macro macro;
                     macro.name = info.macro;
                     Group group;
                     group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
                     macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
-                    nodes.push_back(std::make_unique<Node>(std::move(macro)));
-                    i = j;
-                    matched_prefix = true;
-                    break;
+                    nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+                    pos_ = j;
+                    return true;
                 }
             }
         }
-        if(matched_prefix) continue;
+        return false;
+    }
 
-        // Custom Delimited Markup
-        bool matched_delim = false;
-        for(const auto& info : d_infos)
+    // Handles user-defined delimited markups (e.g., :highlight:).
+    // Matches a delimiter character and searches for a closing delimiter.
+    // Enforces strict rules: content must be non-empty and contain no whitespace and no punctuation other than '_' and '-'.
+    bool handle_delimited_markup()
+    {
+        for (const auto& info : d_infos_)
         {
-            if(c == info.cp)
+            if (input32_[pos_] == info.cp)
             {
-                size_t j = i + 1;
+                size_t j = pos_ + 1;
                 bool valid = true;
                 bool found_end = false;
-                while(j < input32.length())
+                while (j < input32_.length())
                 {
-                    char32_t next_c = input32[j];
-                    if(next_c == info.cp)
+                    char32_t next_c = input32_[j];
+                    if (next_c == info.cp)
                     {
                         found_end = true;
                         break;
                     }
-                    if(una::codepoint::is_whitespace(next_c)) { valid = false; break; }
-                    if(next_c != '_' && next_c != '-' && una::codepoint::prop{next_c}.General_Category_P()) { valid = false; break; }
+                    if (una::codepoint::is_whitespace(next_c))
+                    {
+                        valid = false;
+                        break;
+                    }
+                    if (next_c != '_' && next_c != '-' && una::codepoint::prop{next_c}.General_Category_P())
+                    {
+                        valid = false;
+                        break;
+                    }
                     j++;
                 }
-                
-                if(found_end && valid && j > i + 1)
+
+                if (found_end && valid && j > pos_ + 1)
                 {
                     flush_text();
-                    std::u32string content = input32.substr(i + 1, j - (i + 1));
+                    std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
                     Macro macro;
                     macro.name = info.macro;
                     Group group;
                     group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
                     macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
-                    nodes.push_back(std::make_unique<Node>(std::move(macro)));
-                    i = j + 1;
-                    matched_delim = true;
-                    break;
+                    nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+                    pos_ = j + 1;
+                    return true;
                 }
             }
         }
-        if(matched_delim) continue;
+        return false;
+    }
 
-        // Macro: %name{args}...
-        if(c == '%')
+    // Handles standard macro calls (e.g., %name{arg}).
+    // Parses the macro name and recursively parses its arguments enclosed in {} or [].
+    bool handle_macro()
+    {
+        if (input32_[pos_] == '%')
         {
             flush_text();
-            
-            i++; // skip %
-            size_t name_start = i;
-            while(i < input32.length() && (una::codepoint::is_alphanumeric(input32[i]) || input32[i] == '_'))
+            pos_++; // skip %
+            size_t name_start = pos_;
+            while (pos_ < input32_.length() && (una::codepoint::is_alphanumeric(input32_[pos_]) || input32_[pos_] == '_'))
             {
-                i++;
+                pos_++;
             }
-            std::u32string name32 = input32.substr(name_start, i - name_start);
+            std::u32string name32 = input32_.substr(name_start, pos_ - name_start);
             std::string name = una::utf32to8(name32);
-            
-            if(name.empty())
+
+            if (name.empty())
             {
-                current_text += '%';
-                continue;
+                current_text_ += '%';
+                // Bare '%' with no macro name: keep it as literal text and let the main loop resume.
+                return true; 
             }
 
             Macro macro;
             macro.name = name;
 
-            // Parse Arguments
-            while(i < input32.length())
+            while (pos_ < input32_.length())
             {
-                char32_t open = input32[i];
-                if(open == '{' || open == '[')
+                char32_t open = input32_[pos_];
+                if (open == '{' || open == '[')
                 {
                     char32_t close = (open == '{') ? '}' : ']';
-                    i++; 
-                    
+                    pos_++;
+
                     std::u32string arg_content32;
                     int balance = 1;
-                    while(i < input32.length() && balance > 0)
+                    while (pos_ < input32_.length() && balance > 0)
                     {
-                        if(input32[i] == open)
+                        if (input32_[pos_] == open)
                         {
                             balance++;
-                            arg_content32 += input32[i];
+                            arg_content32 += input32_[pos_];
                         }
-                        else if(input32[i] == close)
+                        else if (input32_[pos_] == close)
                         {
                             balance--;
-                            if(balance > 0) arg_content32 += input32[i];
+                            if (balance > 0) arg_content32 += input32_[pos_];
                         }
                         else
                         {
-                            arg_content32 += input32[i];
+                            arg_content32 += input32_[pos_];
                         }
-                        i++;
+                        pos_++;
                     }
-                    
+
                     Group group;
-                    std::vector<std::unique_ptr<Node>> sub_nodes = parse(una::utf32to8(arg_content32), prefix_markups, delimited_markups);
-                    for(auto& n : sub_nodes)
+                    auto sub_nodes = Parser::parse(una::utf32to8(arg_content32), prefix_markups_, delimited_markups_);
+                    for (auto& n : sub_nodes)
                     {
                         group.addChild(std::move(n));
                     }
@@ -198,118 +258,138 @@ std::vector<std::unique_ptr<Node>> Parser::parse(
                 }
                 else
                 {
-                    break; 
+                    break;
                 }
             }
-            nodes.push_back(std::make_unique<Node>(std::move(macro)));
-            continue;
+            nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+            return true;
         }
-        
-        // Inline Code: `...`
-        if(c == '`')
+        return false;
+    }
+
+    // Handles inline code blocks enclosed in backticks (`code`).
+    bool handle_code()
+    {
+        if (input32_[pos_] == '`')
         {
-            size_t start = i + 1;
-            size_t end = input32.find('`', start);
-            if(end != std::u32string::npos)
+            size_t start = pos_ + 1;
+            size_t end = input32_.find('`', start);
+            if (end != std::u32string::npos)
             {
                 flush_text();
-                std::u32string content32 = input32.substr(start, end - start);
-                
+                std::u32string content32 = input32_.substr(start, end - start);
+
                 Macro macro;
                 macro.name = "code";
-                
+
                 Group group;
                 group.addChild(std::make_unique<Node>(Text{una::utf32to8(content32)}));
-                
+
                 macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
-                nodes.push_back(std::make_unique<Node>(std::move(macro)));
-                i = end + 1;
-                continue;
+                nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+                pos_ = end + 1;
+                return true;
             }
         }
-        
-        // Link: [text](url)
-        if(c == '[')
+        return false;
+    }
+
+    // Handles Markdown links ([text](url)).
+    // Recursively parses the link text.
+    bool handle_link()
+    {
+        if (input32_[pos_] == '[')
         {
-            size_t label_start = i + 1;
+            size_t label_start = pos_ + 1;
             size_t j = label_start;
             int bracket_bal = 1;
-            while(j < input32.length() && bracket_bal > 0)
+            while (j < input32_.length() && bracket_bal > 0)
             {
-                if(input32[j] == '[') bracket_bal++;
-                else if(input32[j] == ']') bracket_bal--;
-                if(bracket_bal > 0) j++;
+                if (input32_[j] == '[') bracket_bal++;
+                else if (input32_[j] == ']') bracket_bal--;
+                if (bracket_bal > 0) j++;
             }
-            
-            if(j < input32.length() && bracket_bal == 0)
+
+            if (j < input32_.length() && bracket_bal == 0)
             {
                 size_t close_bracket = j;
-                if(close_bracket + 1 < input32.length() && input32[close_bracket + 1] == '(')
+                if (close_bracket + 1 < input32_.length() && input32_[close_bracket + 1] == '(')
                 {
                     size_t url_start = close_bracket + 2;
-                    size_t url_end = input32.find(')', url_start);
-                    if(url_end != std::u32string::npos)
+                    size_t url_end = input32_.find(')', url_start);
+                    if (url_end != std::u32string::npos)
                     {
                         flush_text();
-                        std::u32string label32 = input32.substr(label_start, close_bracket - label_start);
-                        std::u32string url32 = input32.substr(url_start, url_end - url_start);
-                        
+                        std::u32string label32 = input32_.substr(label_start, close_bracket - label_start);
+                        std::u32string url32 = input32_.substr(url_start, url_end - url_start);
+
                         Macro macro;
                         macro.name = "link";
-                        
+
                         // Arg 1: URL
                         Group group1;
                         group1.addChild(std::make_unique<Node>(Text{una::utf32to8(url32)}));
                         macro.arguments.push_back(std::make_unique<Node>(std::move(group1)));
-                        
+
                         // Arg 2: Text (parsed)
                         Group group2;
-                        auto sub = parse(una::utf32to8(label32), prefix_markups, delimited_markups);
-                        for(auto& n : sub) group2.addChild(std::move(n));
+                        auto sub = Parser::parse(una::utf32to8(label32), prefix_markups_, delimited_markups_);
+                        for (auto& n : sub) group2.addChild(std::move(n));
                         macro.arguments.push_back(std::make_unique<Node>(std::move(group2)));
-                        
-                        nodes.push_back(std::make_unique<Node>(std::move(macro)));
-                        i = url_end + 1;
-                        continue;
+
+                        nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+                        pos_ = url_end + 1;
+                        return true;
                     }
                 }
             }
         }
-        
-        // Emphasis: * or **
-        if(c == '*')
+        return false;
+    }
+
+    // Handles emphasis (*em*) and strong emphasis (**strong**).
+    // Recursively parses the content.
+    bool handle_emphasis()
+    {
+        if (input32_[pos_] == '*')
         {
-            bool strong = (i + 1 < input32.length() && input32[i+1] == '*');
-            size_t start_content = i + (strong ? 2 : 1);
-            
+            bool strong = (pos_ + 1 < input32_.length() && input32_[pos_ + 1] == '*');
+            size_t start_content = pos_ + (strong ? 2 : 1);
+
             std::u32string delim = strong ? U"**" : U"*";
-            size_t end = input32.find(delim, start_content);
-            
-            if(end != std::u32string::npos)
+            size_t end = input32_.find(delim, start_content);
+
+            if (end != std::u32string::npos)
             {
                 flush_text();
-                std::u32string content32 = input32.substr(start_content, end - start_content);
-                
+                std::u32string content32 = input32_.substr(start_content, end - start_content);
+
                 Macro macro;
                 macro.name = strong ? "strong" : "em";
-                
+
                 Group group;
-                auto sub = parse(una::utf32to8(content32), prefix_markups, delimited_markups);
-                for(auto& n : sub) group.addChild(std::move(n));
+                auto sub = Parser::parse(una::utf32to8(content32), prefix_markups_, delimited_markups_);
+                for (auto& n : sub) group.addChild(std::move(n));
                 macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
-                
-                nodes.push_back(std::make_unique<Node>(std::move(macro)));
-                i = end + delim.length();
-                continue;
+
+                nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+                pos_ = end + delim.length();
+                return true;
             }
         }
-
-        current_text += c;
-        i++;
+        return false;
     }
-    
-    flush_text();
-    return nodes;
+};
+
+} // namespace
+
+std::vector<std::unique_ptr<Node>> Parser::parse(
+    const std::string& input,
+    const std::vector<PrefixMarkup>& prefix_markups,
+    const std::vector<DelimitedMarkup>& delimited_markups)
+{
+    InlineParser parser(input, prefix_markups, delimited_markups);
+    return parser.parse();
 }
 
 } // namespace macrodown