#include "parser.h"
#include "macrodown.h"
#include "uni_algo/all.h"
#include <iostream>
#include <regex>
namespace macrodown
{
namespace
{
// Lookup-table entry that InlineParser's constructor builds from one prefix
// markup definition. Instances are created by positional brace-initialisation,
// so the member order below is significant.
struct PrefixInfo
{
// First code point of the definition's prefix string; the character that
// triggers this markup while scanning.
char32_t cp;
// Name assigned to the Macro node emitted when the markup matches.
std::string macro;
// Regex compiled from the definition's pattern string.
// NOTE(review): no handler visible in this file reads this member after
// construction — confirm how it is meant to be used.
std::regex pattern;
};
// Lookup-table entry that InlineParser's constructor builds from one delimited
// markup definition. Instances are created by positional brace-initialisation,
// so the member order below is significant.
struct DelimInfo
{
// First code point of the definition's delimiter string; the same character
// opens and closes the markup.
char32_t cp;
// Name assigned to the Macro node emitted when the markup matches.
std::string macro;
// Regex compiled from the definition's pattern string.
// NOTE(review): no handler visible in this file reads this member after
// construction — confirm how it is meant to be used.
std::regex pattern;
};
// InlineParser handles the parsing of inline elements within a block of text.
// It scans the input string character by character (in UTF-32) and dispatches
// to specific handlers for different markup types.
class InlineParser
{
public:
InlineParser(const std::string& input,
const std::vector<PrefixMarkup>& prefix_markups,
const std::vector<DelimitedMarkup>& delimited_markups)
: input32_(una::utf8to32u(input)),
prefix_markups_(prefix_markups),
delimited_markups_(delimited_markups)
{
// Pre-process markup definitions into lookup tables for efficiency
for (const auto& m : prefix_markups_)
{
auto cp = una::utf8to32u(m.prefix);
if (!cp.empty())
p_infos_.push_back({cp[0], m.macro_name, std::regex(m.pattern)});
}
for (const auto& m : delimited_markups_)
{
auto cp = una::utf8to32u(m.delimiter);
if (!cp.empty())
d_infos_.push_back({cp[0], m.macro_name, std::regex(m.pattern)});
}
}
// Main parsing loop. Iterates through the input string and attempts to
// match markup elements. If a match is found, the handler advances the
// position. Otherwise, the current character is treated as plain text.
std::vector<std::unique_ptr<Node>> parse()
{
while (pos_ < input32_.length())
{
if (handleEscape()) continue;
if (handlePrefixMarkup()) continue;
if (handleDelimitedMarkup()) continue;
if (handleMacro()) continue;
if (handleCode()) continue;
if (handleLink()) continue;
if (handleEmphasis()) continue;
current_text_ += input32_[pos_];
pos_++;
}
flushText();
return std::move(nodes_);
}
private:
std::u32string input32_;
const std::vector<PrefixMarkup>& prefix_markups_;
const std::vector<DelimitedMarkup>& delimited_markups_;
std::vector<PrefixInfo> p_infos_;
std::vector<DelimInfo> d_infos_;
size_t pos_ = 0;
std::vector<std::unique_ptr<Node>> nodes_;
std::u32string current_text_;
// Pushes accumulated plain text into the node list as a Text node.
void flushText()
{
if (!current_text_.empty())
{
nodes_.push_back(std::make_unique<Node>(Text{una::utf32to8(current_text_)} ));
current_text_.clear();
}
}
// Handles backslash escapes. Consumes the backslash and appends the
// following character literally to the text buffer.
bool handleEscape()
{
if (input32_[pos_] == '\\' && pos_ + 1 < input32_.length())
{
current_text_ += input32_[pos_ + 1];
pos_ += 2;
return true;
}
return false;
}
// Handles user-defined prefix markups (e.g., #tag).
// Matches a specific prefix character and captures text until a boundary
// (whitespace or punctuation, excluding underscore).
bool handlePrefixMarkup()
{
for (const auto& info : p_infos_)
{
if (input32_[pos_] == info.cp)
{
size_t j = pos_ + 1;
while (j < input32_.length())
{
char32_t next_c = input32_[j];
if (una::codepoint::is_whitespace(next_c)) break;
if (next_c != '_' && next_c != '-' && next_c != '@' && next_c != '.' && una::codepoint::prop{next_c}.General_Category_P()) break;
j++;
}
if (j > pos_ + 1)
{
// If the last character was a dot, and it is followed by whitespace or EOF,
// exclude it from the markup.
if (input32_[j - 1] == '.')
{
if (j == input32_.length() || una::codepoint::is_whitespace(input32_[j]))
{
j--;
}
}
}
if (j > pos_ + 1)
{
flushText();
std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
Macro macro;
macro.name = info.macro;
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
pos_ = j;
return true;
}
}
}
return false;
}
// Handles user-defined delimited markups (e.g., :highlight:).
// Matches a delimiter character and searches for a closing delimiter.
// Enforces strict rules: no whitespace inside, no punctuation boundaries.
bool handleDelimitedMarkup()
{
for (const auto& info : d_infos_)
{
if (input32_[pos_] == info.cp)
{
size_t j = pos_ + 1;
bool valid = true;
bool found_end = false;
while (j < input32_.length())
{
char32_t next_c = input32_[j];
if (next_c == info.cp)
{
found_end = true;
break;
}
if (una::codepoint::is_whitespace(next_c))
{
valid = false;
break;
}
if (next_c != '_' && next_c != '-' && una::codepoint::prop{next_c}.General_Category_P())
{
valid = false;
break;
}
j++;
}
if (found_end && valid && j > pos_ + 1)
{
flushText();
std::u32string content = input32_.substr(pos_ + 1, j - (pos_ + 1));
Macro macro;
macro.name = info.macro;
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
pos_ = j + 1;
return true;
}
}
}
return false;
}
// Handles standard macro calls (e.g., %name{arg}).
// Parses the macro name and recursively parses its arguments enclosed in {} or [].
bool handleMacro()
{
if (input32_[pos_] == '%')
{
flushText();
pos_++; // skip %
size_t name_start = pos_;
while (pos_ < input32_.length() && (una::codepoint::is_alphanumeric(input32_[pos_]) || input32_[pos_] == '_'))
{
pos_++;
}
std::u32string name32 = input32_.substr(name_start, pos_ - name_start);
std::string name = una::utf32to8(name32);
if (name.empty())
{
current_text_ += '%';
// Continue from loop
return true;
}
Macro macro;
macro.name = name;
while (pos_ < input32_.length())
{
char32_t open = input32_[pos_];
if (open == '{' || open == '[')
{
char32_t close = (open == '{') ? '}' : ']';
pos_++;
std::u32string arg_content32;
int balance = 1;
while (pos_ < input32_.length() && balance > 0)
{
if (input32_[pos_] == open)
{
balance++;
arg_content32 += input32_[pos_];
}
else if (input32_[pos_] == close)
{
balance--;
if (balance > 0) arg_content32 += input32_[pos_];
}
else
{
arg_content32 += input32_[pos_];
}
pos_++;
}
Group group;
auto sub_nodes = Parser::parse(una::utf32to8(arg_content32), prefix_markups_, delimited_markups_);
for (auto& n : sub_nodes)
{
group.addChild(std::move(n));
}
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
}
else
{
break;
}
}
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
return true;
}
return false;
}
// Handles inline code blocks enclosed in backticks (`code`).
bool handleCode()
{
if (input32_[pos_] == '`')
{
size_t start = pos_ + 1;
size_t end = input32_.find('`', start);
if (end != std::u32string::npos)
{
flushText();
std::u32string content32 = input32_.substr(start, end - start);
Macro macro;
macro.name = "code";
Group group;
group.addChild(std::make_unique<Node>(Text{una::utf32to8(content32)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
pos_ = end + 1;
return true;
}
}
return false;
}
// Handles Markdown links ([text](url)).
// Recursively parses the link text.
bool handleLink()
{
if (input32_[pos_] == '[')
{
size_t label_start = pos_ + 1;
size_t j = label_start;
int bracket_bal = 1;
while (j < input32_.length() && bracket_bal > 0)
{
if (input32_[j] == '[') bracket_bal++;
else if (input32_[j] == ']') bracket_bal--;
if (bracket_bal > 0) j++;
}
if (j < input32_.length() && bracket_bal == 0)
{
size_t close_bracket = j;
if (close_bracket + 1 < input32_.length() && input32_[close_bracket + 1] == '(')
{
size_t url_start = close_bracket + 2;
size_t url_end = input32_.find(')', url_start);
if (url_end != std::u32string::npos)
{
flushText();
std::u32string label32 = input32_.substr(label_start, close_bracket - label_start);
std::u32string url32 = input32_.substr(url_start, url_end - url_start);
Macro macro;
macro.name = "link";
// Arg 1: URL
Group group1;
group1.addChild(std::make_unique<Node>(Text{una::utf32to8(url32)}));
macro.arguments.push_back(std::make_unique<Node>(std::move(group1)));
// Arg 2: Text (parsed)
Group group2;
auto sub = Parser::parse(una::utf32to8(label32), prefix_markups_, delimited_markups_);
for (auto& n : sub) group2.addChild(std::move(n));
macro.arguments.push_back(std::make_unique<Node>(std::move(group2)));
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
pos_ = url_end + 1;
return true;
}
}
}
}
return false;
}
// Handles emphasis (*em*) and strong emphasis (**strong**).
// Recursively parses the content.
bool handleEmphasis()
{
if (input32_[pos_] == '*')
{
bool strong = (pos_ + 1 < input32_.length() && input32_[pos_ + 1] == '*');
size_t start_content = pos_ + (strong ? 2 : 1);
std::u32string delim = strong ? U"**" : U"*";
size_t end = input32_.find(delim, start_content);
if (end != std::u32string::npos)
{
flushText();
std::u32string content32 = input32_.substr(start_content, end - start_content);
Macro macro;
macro.name = strong ? "strong" : "em";
Group group;
auto sub = Parser::parse(una::utf32to8(content32), prefix_markups_, delimited_markups_);
for (auto& n : sub) group.addChild(std::move(n));
macro.arguments.push_back(std::make_unique<Node>(std::move(group)));
nodes_.push_back(std::make_unique<Node>(std::move(macro)));
pos_ = end + delim.length();
return true;
}
}
return false;
}
};
} // namespace
// Parses `input` (UTF-8) as inline content and returns the resulting top-level
// nodes. A fresh InlineParser is built for every call; the inline handlers
// also call this function to parse nested content recursively.
std::vector<std::unique_ptr<Node>> Parser::parse(
    const std::string& input,
    const std::vector<PrefixMarkup>& prefix_markups,
    const std::vector<DelimitedMarkup>& delimited_markups)
{
    return InlineParser(input, prefix_markups, delimited_markups).parse();
}
} // namespace macrodown