Changes
diff --git a/src/parser.cpp b/src/parser.cpp
index 16e256f..8dcdc08 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -6,191 +6,251 @@
namespace macrodown
{
-std::vector<std::unique_ptr<Node>> Parser::parse(
- const std::string& input,
- const std::vector<PrefixMarkup>& prefix_markups,
- const std::vector<DelimitedMarkup>& delimited_markups)
+namespace
{
- std::u32string input32 = una::utf8to32u(input);
- std::vector<std::unique_ptr<Node>> nodes;
- std::u32string current_text;
- auto flush_text = [&]()
+struct PrefixInfo
+{
+ char32_t cp; // First code point of the markup's prefix string.
+ std::string macro; // Name given to the Macro node created when this prefix matches.
+};
+
+struct DelimInfo
+{
+ char32_t cp; // First code point of the markup's delimiter string; it both opens and closes the span.
+ std::string macro; // Name given to the Macro node created when a delimited span matches.
+};
+
+// InlineParser handles the parsing of inline elements within a block of text.
+// It scans the input string character by character (in UTF-32) and dispatches
+// to specific handlers for different markup types.
+class InlineParser
+{
+public:
+ InlineParser(const std::string& input,
+ const std::vector<PrefixMarkup>& prefix_markups,
+ const std::vector<DelimitedMarkup>& delimited_markups)
+ : input32_(una::utf8to32u(input)),
+ prefix_markups_(prefix_markups),
+ delimited_markups_(delimited_markups)
{
- if(!current_text.empty())
+ // Cache the first code point of each markup's prefix/delimiter; markups whose prefix/delimiter is empty are skipped.
+ for (const auto& m : prefix_markups_)
{
- nodes.push_back(std::make_unique<Node>(Text{una::utf32to8(current_text)}));
- current_text.clear();
+ auto cp = una::utf8to32u(m.prefix);
+ if (!cp.empty())
+ p_infos_.push_back({cp[0], m.macro_name});
}
- };
-
- // Pre-calculate code points for custom markups
- struct PrefixInfo {
- char32_t cp;
- std::string macro;
- };
- std::vector<PrefixInfo> p_infos;
- for(const auto& m : prefix_markups)
+ for (const auto& m : delimited_markups_)
+ {
+ auto cp = una::utf8to32u(m.delimiter);
+ if (!cp.empty())
+ d_infos_.push_back({cp[0], m.macro_name});
+ }
+ }
+
+ // Main parsing loop. Iterates through the input string and attempts to
+ // match markup elements. If a match is found, the handler advances the
+ // position. Otherwise, the current character is treated as plain text.
+ std::vector<std::unique_ptr<Node>> parse()
{
- auto cp = una::utf8to32u(m.prefix);
- if(!cp.empty()) p_infos.push_back({cp[0], m.macro_name});
+ while (pos_ < input32_.length())
+ {
+ if (handle_escape()) continue;
+ if (handle_prefix_markup()) continue;
+ if (handle_delimited_markup()) continue;
+ if (handle_macro()) continue;
+ if (handle_code()) continue;
+ if (handle_link()) continue;
+ if (handle_emphasis()) continue;
+
+ current_text_ += input32_[pos_];
+ pos_++;
+ }
+ flush_text();
+ return std::move(nodes_);
}
- struct DelimInfo {
- char32_t cp;
- std::string macro;
- };
- std::vector<DelimInfo> d_infos;
- for(const auto& m : delimited_markups)
+private:
+ std::u32string input32_;
+ const std::vector<PrefixMarkup>& prefix_markups_;
+ const std::vector<DelimitedMarkup>& delimited_markups_;
+
+ std::vector<PrefixInfo> p_infos_;
+ std::vector<DelimInfo> d_infos_;
+
+ size_t pos_ = 0;
+ std::vector<std::unique_ptr<Node>> nodes_;
+ std::u32string current_text_;
+
+ // Pushes accumulated plain text into the node list as a Text node.
+ void flush_text()
{
- auto cp = una::utf8to32u(m.delimiter);
- if(!cp.empty()) d_infos.push_back({cp[0], m.macro_name});
+ if (!current_text_.empty())
+ {
+ nodes_.push_back(std::make_unique<Node>(Text{una::utf32to8(current_text_)}));
+ current_text_.clear();
+ }
}
- size_t i = 0;
- while(i < input32.length())
+ // Handles backslash escapes. Consumes the backslash and appends the
+ // following character literally to the text buffer.
+ bool handle_escape()
{
- char32_t c = input32[i];
-
- // Escape handling
- if(c == '\\' && i + 1 < input32.length())
+ if (input32_[pos_] == '\\' && pos_ + 1 < input32_.length())
{
- current_text += input32[i+1];
- i += 2;
- continue;
+ current_text_ += input32_[pos_ + 1];
+ pos_ += 2;
+ return true;
}
+ return false;
+ }
- // Custom Prefix Markup
- bool matched_prefix = false;
- for(const auto& info : p_infos)
+ // Handles user-defined prefix markups (e.g., #tag).
+ // Matches a specific prefix character and captures text until a boundary
+ // (whitespace or punctuation, excluding underscore).
+ bool handle_prefix_markup()
+ {
+ for (const auto& info : p_infos_)
{
- if(c == info.cp)
+ if (input32_[pos_] == info.cp)
{
- // Scan until whitespace or punctuation (except _)
- size_t j = i + 1;
- while(j < input32.length())
+ size_t j = pos_ + 1;
+ while (j < input32_.length())
{
- char32_t next_c = input32[j];
- if(una::codepoint::is_whitespace(next_c)) break;
- // Punctuation check using uni-algo
- if(next_c != '_' && una::codepoint::prop{next_c}.General_Category_P()) break;
+ char32_t next_c = input32_[j];
+ if (una::codepoint::is_whitespace(next_c)) break;
+ if (next_c != '_' && una::codepoint::prop{next_c}.General_Category_P()) break;
j++;
}
-
- if(j > i + 1)
+
+ if (j > pos_ + 1)
{
flush_text();
- std::u32string content = input32.substr(i + 1, j - (i + 1));
+ std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
Macro macro;
macro.name = info.macro;
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- i = j;
- matched_prefix = true;
- break;
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ pos_ = j;
+ return true;
}
}
}
- if(matched_prefix) continue;
+ return false;
+ }
- // Custom Delimited Markup
- bool matched_delim = false;
- for(const auto& info : d_infos)
+ // Handles user-defined delimited markups (e.g., :highlight:).
+ // Matches a delimiter character and searches for a closing delimiter.
+ // Rejects empty content and content containing whitespace or punctuation other than '_' or '-'.
+ bool handle_delimited_markup()
+ {
+ for (const auto& info : d_infos_)
{
- if(c == info.cp)
+ if (input32_[pos_] == info.cp)
{
- size_t j = i + 1;
+ size_t j = pos_ + 1;
bool valid = true;
bool found_end = false;
- while(j < input32.length())
+ while (j < input32_.length())
{
- char32_t next_c = input32[j];
- if(next_c == info.cp)
+ char32_t next_c = input32_[j];
+ if (next_c == info.cp)
{
found_end = true;
break;
}
- if(una::codepoint::is_whitespace(next_c)) { valid = false; break; }
- if(next_c != '_' && next_c != '-' && una::codepoint::prop{next_c}.General_Category_P()) { valid = false; break; }
+ if (una::codepoint::is_whitespace(next_c))
+ {
+ valid = false;
+ break;
+ }
+ if (next_c != '_' && next_c != '-' && una::codepoint::prop{next_c}.General_Category_P())
+ {
+ valid = false;
+ break;
+ }
j++;
}
-
- if(found_end && valid && j > i + 1)
+
+ if (found_end && valid && j > pos_ + 1)
{
flush_text();
- std::u32string content = input32.substr(i + 1, j - (i + 1));
+ std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
Macro macro;
macro.name = info.macro;
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- i = j + 1;
- matched_delim = true;
- break;
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ pos_ = j + 1;
+ return true;
}
}
}
- if(matched_delim) continue;
+ return false;
+ }
- // Macro: %name{args}...
- if(c == '%')
+ // Handles standard macro calls (e.g., %name{arg}).
+ // Parses the macro name and recursively parses its arguments enclosed in {} or [].
+ bool handle_macro()
+ {
+ if (input32_[pos_] == '%')
{
flush_text();
-
- i++; // skip %
- size_t name_start = i;
- while(i < input32.length() && (una::codepoint::is_alphanumeric(input32[i]) || input32[i] == '_'))
+ pos_++; // skip %
+ size_t name_start = pos_;
+ while (pos_ < input32_.length() && (una::codepoint::is_alphanumeric(input32_[pos_]) || input32_[pos_] == '_'))
{
- i++;
+ pos_++;
}
- std::u32string name32 = input32.substr(name_start, i - name_start);
+ std::u32string name32 = input32_.substr(name_start, pos_ - name_start);
std::string name = una::utf32to8(name32);
-
- if(name.empty())
+
+ if (name.empty())
{
- current_text += '%';
- continue;
+ current_text_ += '%';
+ // No macro name after '%': keep it as literal text and let the main loop resume.
+ return true;
}
Macro macro;
macro.name = name;
- // Parse Arguments
- while(i < input32.length())
+ while (pos_ < input32_.length())
{
- char32_t open = input32[i];
- if(open == '{' || open == '[')
+ char32_t open = input32_[pos_];
+ if (open == '{' || open == '[')
{
char32_t close = (open == '{') ? '}' : ']';
- i++;
-
+ pos_++;
+
std::u32string arg_content32;
int balance = 1;
- while(i < input32.length() && balance > 0)
+ while (pos_ < input32_.length() && balance > 0)
{
- if(input32[i] == open)
+ if (input32_[pos_] == open)
{
balance++;
- arg_content32 += input32[i];
+ arg_content32 += input32_[pos_];
}
- else if(input32[i] == close)
+ else if (input32_[pos_] == close)
{
balance--;
- if(balance > 0) arg_content32 += input32[i];
+ if (balance > 0) arg_content32 += input32_[pos_];
}
else
{
- arg_content32 += input32[i];
+ arg_content32 += input32_[pos_];
}
- i++;
+ pos_++;
}
-
+
Group group;
- std::vector<std::unique_ptr<Node>> sub_nodes = parse(una::utf32to8(arg_content32), prefix_markups, delimited_markups);
- for(auto& n : sub_nodes)
+ auto sub_nodes = Parser::parse(una::utf32to8(arg_content32), prefix_markups_, delimited_markups_);
+ for (auto& n : sub_nodes)
{
group.addChild(std::move(n));
}
@@ -198,118 +258,138 @@ std::vector<std::unique_ptr<Node>> Parser::parse(
}
else
{
- break;
+ break;
}
}
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- continue;
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ return true;
}
-
- // Inline Code: `...`
- if(c == '`')
+ return false;
+ }
+
+ // Handles inline code blocks enclosed in backticks (`code`).
+ bool handle_code()
+ {
+ if (input32_[pos_] == '`')
{
- size_t start = i + 1;
- size_t end = input32.find('`', start);
- if(end != std::u32string::npos)
+ size_t start = pos_ + 1;
+ size_t end = input32_.find('`', start);
+ if (end != std::u32string::npos)
{
flush_text();
- std::u32string content32 = input32.substr(start, end - start);
-
+ std::u32string content32 = input32_.substr(start, end - start);
+
Macro macro;
macro.name = "code";
-
+
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content32)}));
-
+
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- i = end + 1;
- continue;
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ pos_ = end + 1;
+ return true;
}
}
-
- // Link: [text](url)
- if(c == '[')
+ return false;
+ }
+
+ // Handles Markdown links ([text](url)).
+ // Recursively parses the link text.
+ bool handle_link()
+ {
+ if (input32_[pos_] == '[')
{
- size_t label_start = i + 1;
+ size_t label_start = pos_ + 1;
size_t j = label_start;
int bracket_bal = 1;
- while(j < input32.length() && bracket_bal > 0)
+ while (j < input32_.length() && bracket_bal > 0)
{
- if(input32[j] == '[') bracket_bal++;
- else if(input32[j] == ']') bracket_bal--;
- if(bracket_bal > 0) j++;
+ if (input32_[j] == '[') bracket_bal++;
+ else if (input32_[j] == ']') bracket_bal--;
+ if (bracket_bal > 0) j++;
}
-
- if(j < input32.length() && bracket_bal == 0)
+
+ if (j < input32_.length() && bracket_bal == 0)
{
size_t close_bracket = j;
- if(close_bracket + 1 < input32.length() && input32[close_bracket + 1] == '(')
+ if (close_bracket + 1 < input32_.length() && input32_[close_bracket + 1] == '(')
{
size_t url_start = close_bracket + 2;
- size_t url_end = input32.find(')', url_start);
- if(url_end != std::u32string::npos)
+ size_t url_end = input32_.find(')', url_start);
+ if (url_end != std::u32string::npos)
{
flush_text();
- std::u32string label32 = input32.substr(label_start, close_bracket - label_start);
- std::u32string url32 = input32.substr(url_start, url_end - url_start);
-
+ std::u32string label32 = input32_.substr(label_start, close_bracket - label_start);
+ std::u32string url32 = input32_.substr(url_start, url_end - url_start);
+
Macro macro;
macro.name = "link";
-
+
// Arg 1: URL
Group group1;
group1.addChild(std::make_unique<Node>(Text{una::utf32to8(url32)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group1)));
-
+
// Arg 2: Text (parsed)
Group group2;
- auto sub = parse(una::utf32to8(label32), prefix_markups, delimited_markups);
- for(auto& n : sub) group2.addChild(std::move(n));
+ auto sub = Parser::parse(una::utf32to8(label32), prefix_markups_, delimited_markups_);
+ for (auto& n : sub) group2.addChild(std::move(n));
macro.arguments.push_back(std::make_unique<Node>(std::move(group2)));
-
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- i = url_end + 1;
- continue;
+
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ pos_ = url_end + 1;
+ return true;
}
}
}
}
-
- // Emphasis: * or **
- if(c == '*')
+ return false;
+ }
+
+ // Handles emphasis (*em*) and strong emphasis (**strong**).
+ // Recursively parses the content.
+ bool handle_emphasis()
+ {
+ if (input32_[pos_] == '*')
{
- bool strong = (i + 1 < input32.length() && input32[i+1] == '*');
- size_t start_content = i + (strong ? 2 : 1);
-
+ bool strong = (pos_ + 1 < input32_.length() && input32_[pos_ + 1] == '*');
+ size_t start_content = pos_ + (strong ? 2 : 1);
+
std::u32string delim = strong ? U"**" : U"*";
- size_t end = input32.find(delim, start_content);
-
- if(end != std::u32string::npos)
+ size_t end = input32_.find(delim, start_content);
+
+ if (end != std::u32string::npos)
{
flush_text();
- std::u32string content32 = input32.substr(start_content, end - start_content);
-
+ std::u32string content32 = input32_.substr(start_content, end - start_content);
+
Macro macro;
macro.name = strong ? "strong" : "em";
-
+
Group group;
- auto sub = parse(una::utf32to8(content32), prefix_markups, delimited_markups);
- for(auto& n : sub) group.addChild(std::move(n));
+ auto sub = Parser::parse(una::utf32to8(content32), prefix_markups_, delimited_markups_);
+ for (auto& n : sub) group.addChild(std::move(n));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
-
- nodes.push_back(std::make_unique<Node>(std::move(macro)));
- i = end + delim.length();
- continue;
+
+ nodes_.push_back(std::make_unique<Node>(std::move(macro)));
+ pos_ = end + delim.length();
+ return true;
}
}
-
- current_text += c;
- i++;
+ return false;
}
-
- flush_text();
- return nodes;
+};
+
+} // namespace
+
+std::vector<std::unique_ptr<Node>> Parser::parse(
+ const std::string& input,
+ const std::vector<PrefixMarkup>& prefix_markups,
+ const std::vector<DelimitedMarkup>& delimited_markups)
+{
+ InlineParser parser(input, prefix_markups, delimited_markups); // Fresh parser per call; nested content (macro args, link text, emphasis) re-enters Parser::parse.
+ return parser.parse();
}
} // namespace macrodown