diff options
author | Jeremias Stotter <jeremias@stotter.eu> | 2022-04-10 19:28:10 +0200 |
---|---|---|
committer | Jeremias Stotter <jeremias@stotter.eu> | 2022-04-10 19:28:10 +0200 |
commit | 6c6151539a2e267aab06edc4c492ea6112d2215a (patch) | |
tree | 7b4dc1efa93410d7bcf284c2bf5d5fc31c8e92bd | |
parent | 6027ef98e693ccc3b06a0f7c3bc11fe2fde20a07 (diff) | |
parent | 4fd8facc45e587b3281d8f032d6f8c283c6b095b (diff) | |
download | JBlog-6c6151539a2e267aab06edc4c492ea6112d2215a.tar.gz JBlog-6c6151539a2e267aab06edc4c492ea6112d2215a.tar.bz2 JBlog-6c6151539a2e267aab06edc4c492ea6112d2215a.zip |
Merge branch 'new-md'
-rw-r--r-- | jblog.c | 3 | ||||
-rw-r--r-- | makefile | 2 | ||||
-rw-r--r-- | md.c | 1304 | ||||
-rw-r--r-- | md.h | 1 |
4 files changed, 857 insertions, 453 deletions
@@ -74,9 +74,6 @@ void* tree_root = NULL; int logfile = -1; -#define LL_INFO 0 -#define LL_WARN 1 -#define LL_ERR 2 void jb_log(int loglevel, bool include_errno, char* error_string) { // @todo implement a minimum loglevel command line switch int curr_errno = errno; @@ -1,5 +1,5 @@ CC=gcc -CFLAGS=-O2 -std=c99 -Wall +CFLAGS=-O0 -g -std=c99 -Wall BINDIR=/usr/bin INITDIR=/etc/init.d CONFDIR=/etc/conf.d @@ -21,512 +21,918 @@ #include <string.h> #include <stdlib.h> #include <stdbool.h> +#include <limits.h> + + +#include <time.h> #include "jblog.h" -#define LINE_MAX 4096 +//#define LINE_MAX 4096 // Some not so nice things about this: // HTML escapes like * are not handled by this although they might be handled by the client anyways // No setext headings because why not just use the hash symbol? -// @todo when implementing the cycling through characters replace tabs with 4 spaces +char* get_link_components(char* start, char** out_text, char** out_loc, size_t* out_len); -// This function outputs a pointer that points past leading spaces -// it returns the number of spaces skippen where \t = 4 spaces -int trim_space(char* input, char** output) { - char* trimmed = input; - int count = 0; - while(*trimmed == ' ' || *trimmed == '\t') { - if(*trimmed == '\t') { - count += 4; - } else { - count ++; +struct list_data { + // True if ol, false if ul + bool ordered; + // If ol this is set to the number the list starts counting from, in case of ul it is ignored + int start; + // The number of spaces used at this level of the list + int indent; +}; + +enum html_type { + t_root = 0, + t_inner, + t_h, // Contains a pointer to an int as its value, this pointer indicates title strength (1-6) + t_p, + t_br, + t_str_ast, // t_str contains an int pointer to its level + t_str_und, + t_img, + t_a, + t_ol, + t_list, // Value contains the ammount of spaces used to indent at that level + t_li, + t_hr, + t_code, + t_inline_code, + t_bq +}; + +// The file will be structured as a tree that, in the end will be converted to valid html +struct tree_element { + struct tree_element* parent; + int type; + int children_n; + struct tree_element** children; + // Is this element allowed to have inner html + bool allow_inner; + // Can be anythign additional, like src for img or text in the case of inner + void* value; +}; + +struct tree_element* new_element() { + return calloc(1, sizeof(struct tree_element)); +} + +void free_tree(struct tree_element* root) { + if(root->value) + free(root->value); + if(root->children) { + for(int i = 0; i < root->children_n; i++) { + free_tree(root->children[i]); } - trimmed++; + free(root->children); } - *output = trimmed; - return count; + free(root); } -// Append in_src to in_dst, then return length of new string -size_t append(char* in_dest, char* in_src) { - strcat(in_dest, in_src); - // for pure safety reasons I'll put a terminating \0 at the end of the destination string - in_dest[LINE_MAX-1] = '\0'; - return strlen(in_dest); +// Index may be -1, then we will add to the end +// Otherwise we add at location index +// +// Index may only be between -1 and parent->children_n + 1, otherwise this will crash +struct tree_element* new_child(struct tree_element* parent, int index) { + parent->children_n++; + parent->children = realloc(parent->children, sizeof(struct tree_element*)*(parent->children_n)); + struct tree_element* child = new_element(); + child->parent = parent; + if(index == -1) + (parent->children)[parent->children_n-1] = child; + else { + // Move the elements in the children list after the new element + for(int i = 0; i < (parent->children_n - index - 1); i++) { + /*memmove(parent->children + (parent->children_n - i - 1) * sizeof(struct tree_element*), + parent->children + (parent->children_n - i - 2) * sizeof(struct tree_element*), + sizeof(struct tree_element*));*/ + (parent->children)[parent->children_n - i - 1] = (parent->children)[parent->children_n - i - 2]; + } + (parent->children)[index] = child; + } + return child; } -// Prepend in_src to in_dst, then return length of new string -size_t prepend(char* in_dest, char* in_src) { - char prepended_string[LINE_MAX] = {'\0'}; - strncpy(prepended_string, in_src, LINE_MAX - 1); - strncat(prepended_string, in_dest, LINE_MAX - 1); - memcpy(in_dest, prepended_string, LINE_MAX); - // for pure safety reasons I'll put a terminating \0 at the end of the destination string - in_dest[LINE_MAX-1] = '\0'; - return strlen(in_dest); +int utf8_length; +long int unicode_char; +// Returns the bytes written +void html_escape(char* output_buffer, char input) { + if((input & 0xFFFF0000) == 0xFFFF0000) { + // UTF-8 + int leading_ones = 0; + char copy_input = input; + while(copy_input & 0b10000000) { + leading_ones++; + copy_input = copy_input << 1; + } + if(leading_ones > 1) { + utf8_length = leading_ones; + unicode_char = 0x0; + } + unicode_char = unicode_char << (8 - leading_ones -1); + unicode_char = unicode_char | ( input & (UCHAR_MAX >> leading_ones) ); + utf8_length--; + + if(utf8_length) + *output_buffer = '\0'; + else { + snprintf(output_buffer, 96, "&#x%lX;", unicode_char); + } + } else { + snprintf(output_buffer, 96, "&#x%X;", input); + } } -// start is the first curly bracket we have -// Returns pointer to last round bracket -// free the returned pointers yourself -char* get_link_components(char* start, char** out_text, char** out_loc, size_t* out_len) { - // Search for the next closing bracket - char* closing_sqr_bracket = strchr(start, ']'); - if(closing_sqr_bracket == NULL) { - return NULL; +// Reallocs the string dest to fit src, then append +char* realloc_append(char* dest, char* src) { + //printf("Dest:%sSrc:%s\n", dest ? dest : "", src ? src : ""); + size_t dest_len = dest ? strlen(dest) : 0; + dest = realloc(dest, dest_len + (src ? strlen(src) : 0) + 1); + if(src) + strcpy(dest + dest_len, src); + return dest; +} + +#define p_html "<p>%s</p>\n" +#define br_html "<br/>" +#define h_html "<h%d>%s</h%d>\n" +#define em_html "<em>%s</em>\n" +#define b_html "<b>%s</b>\n" +#define emb_html "<em><b>%s</b></em>\n" +#define img_html "<img src=\"%s\" alt=\"%s\" title=\"\"/>\n" +#define a_html "<a href=\"%s\">%s</a>\n" +#define ul_html "<ul>%s</ul>\n" +#define ol_html "<ol start=\"%d\">%s</ol>\n" +#define li_html "<li>%s</li>\n" +#define hr_html "<hr/>\n" +#define code_html "<pre><code>%s</code></pre>\n" +#define inline_code_html "<code>%s</code>" +#define bq_html "<blockquote>%s</blockquote>\n" +// Resolve a tree to html +int depth = 0; +char* tree_to_html(struct tree_element* root) { + char* html=NULL; + char* inner_html = NULL; + for(int i = 0; i < root->children_n; i++) { + struct tree_element* child = root->children[i]; + switch(child->type) { + case(t_inner): + // Append the inner html to buffer + inner_html = realloc_append(inner_html, child->value); + break; + default: + char* child_html = tree_to_html(child); + inner_html = realloc_append(inner_html, child_html); + free(child_html); + break; + } } - if(*(closing_sqr_bracket + 1) != '(') { - return NULL; +// printf("%s\n", inner_html ? inner_html : "(EMPTY)"); + #define realloc_len(html_pattern) (inner_html ? strlen(inner_html) : 0) + strlen(html_pattern) + 1 + #define realloc_for_html(html_pattern) realloc(html, realloc_len(html_pattern)); + switch(root->type) { + case(t_p): + html = realloc_for_html(p_html); + sprintf(html, p_html, inner_html ? inner_html : ""); + break; + case(t_h): + html = realloc_for_html(h_html); + sprintf(html, h_html, *(int*)(root->value), inner_html ? inner_html : "", *(int*)(root->value)); + break; + case(t_str_ast): + case(t_str_und): + switch(*(unsigned int*)root->value) { + case(1): + html = realloc_for_html(em_html); + sprintf(html, em_html, inner_html ? inner_html : ""); + break; + case(2): + html = realloc_for_html(b_html); + sprintf(html, b_html, inner_html ? inner_html : ""); + break; + case(3): + html = realloc_for_html(emb_html); + sprintf(html, emb_html, inner_html ? inner_html : ""); + break; + default: + html = realloc_for_html(""); + strcpy(html, inner_html ? inner_html : ""); + break; + } + break; + case(t_img): + html = realloc(html, realloc_len(img_html) + strlen(root->value)); + if(!html) + exit(-1); + sprintf(html, img_html, root->value ? (char*)(root->value) : "", inner_html ? inner_html : ""); + break; + case(t_a): + html = realloc(html, realloc_len(a_html) + strlen(root->value)); + if(!html) + exit(-1); + sprintf(html, a_html, root->value ? (char*)(root->value) : "", inner_html ? inner_html : ""); + break; + case(t_br): + html = realloc_for_html(br_html); + strcpy(html, br_html); + break; + case(t_list): + if(((struct list_data*)(root->value))->ordered) { + // We have an ordered list + int digits = 0; + for(int counter = 1; counter < ((struct list_data*)(root->value))->start; counter *= 10, digits++); + html = realloc(html, realloc_len(ol_html) + digits + 1); + sprintf(html, ol_html, ((struct list_data*)(root->value))->start, inner_html ? inner_html : ""); + } else { + // We have an unordered list + html = realloc_for_html(ul_html); + sprintf(html, ul_html, inner_html ? inner_html : ""); + } + break; + case(t_li): + html = realloc_for_html(li_html); + sprintf(html, li_html, inner_html ? inner_html : ""); + break; + case(t_hr): + html = realloc_for_html(hr_html); + strcpy(html, hr_html); + break; + case(t_code): + html = realloc_for_html(code_html); + sprintf(html, code_html, inner_html ? inner_html : ""); + break; + case(t_inline_code): + if(*(bool*)(root->value)) { + html = realloc_for_html(inline_code_html); + sprintf(html, inline_code_html, inner_html ? inner_html : ""); + } else { + html = realloc(html, inner_html ? strlen(inner_html) : 0 + 1); + memcpy(html, "`", 2); + strcat(html, inner_html ? inner_html : ""); + } + break; + case(t_bq): + html = realloc_for_html(bq_html); + sprintf(html, bq_html, inner_html ? inner_html : ""); + break; + default: + html=inner_html; + break; } - char* closing_rnd_bracket = strchr(closing_sqr_bracket + 1, ')'); - if(closing_rnd_bracket == NULL) { - return NULL; + return html; +} + +// Use this to create a new child of another element and quickly set its atributes +#define NEW_ACTIVE_CHILD(ae_var, parent, index, type_v, allow_inner_v) ae_var = new_child(parent, index); ae_var->type = type_v; ae_var->allow_inner = allow_inner_v; + +// Use this to find the next parent that allows inner elements +#define NEXT_ALLOW_INNER(active_element, root) while(!(active_element == root) && (!active_element->allow_inner && active_element)) active_element = active_element->parent; + +// This appends the cur_char to active element / it creates a new active element if the active element can not have text +void append_char_to_active(struct tree_element* root, struct tree_element** active_element, char cur_char) { + // We are not allowed to add inner to this element so we'll start a new paragraph + struct tree_element* new_active_element = active_element ? *active_element : root; + if(active_element) { + NEXT_ALLOW_INNER((*active_element), root) + if(*active_element == root) { + NEW_ACTIVE_CHILD(new_active_element, root, -1, t_p, true); + } } - size_t link_text_len = closing_sqr_bracket - start - 1; - size_t link_loc_len = closing_rnd_bracket - closing_sqr_bracket - 2; - - char* link_text = calloc(link_text_len + 1, 1); - if(link_text == NULL) { - jb_log(LL_ERR, true, "calloc error"); - return NULL; + if(new_active_element->type != t_inner) { + NEW_ACTIVE_CHILD(new_active_element, new_active_element, -1, t_inner, false); } - char* link_loc = calloc(link_loc_len + 1, 1); - if(link_text == NULL) { - jb_log(LL_ERR, true, "calloc error"); - free(link_text); - return NULL; + if((cur_char <= 47 || + (cur_char >=58 && cur_char <=64) || + (cur_char >=91 && cur_char <=96) || + cur_char > 122) && cur_char != 0x20) { + // Escape just to be safe + char append[96] = ""; + html_escape(append, cur_char); + new_active_element->value = realloc_append(new_active_element->value, append); + } else { + // This is stupid, improve this later xD + char append[2] = {cur_char, '\0'}; + new_active_element->value = realloc_append(new_active_element->value, append); + } + if(active_element) + *active_element = new_active_element; +} + +// This adds a new unordered list and returns a pointer to it +// indent is the spaces that the list indicator ( * or - ) was intendend +struct tree_element* new_list(struct tree_element* parent, int indent, bool ordered, int start) { + struct tree_element *new_list; + NEW_ACTIVE_CHILD(new_list, parent, -1, t_list, false); + struct list_data* data = malloc(sizeof(struct list_data)); + data->indent = indent; + data->ordered = ordered; + data->start = start; + new_list->value = data; + return new_list; +} + +// Walk up the tree until root, stop if we encounter the requested type and return that node, otherwise return NULL +struct tree_element* find_parent_type(struct tree_element* root, struct tree_element* start, int type) { + struct tree_element* check_element = start; + while(check_element != root) { + if(check_element->type == type) + return check_element; + check_element = check_element->parent; + } + return NULL; +} + +// These store temporary strength values +unsigned int temp_str_ast = 0; +unsigned int temp_str_und = 0; + +// These are true if a strength element is waiting for closing +bool str_cl_wait_ast = false; +bool str_cl_wait_und = false; + +// These are true if a strength element waits for a different character +bool str_chr_wait_ast = false; +bool str_chr_wait_und = false; + +bool str_fin_wait_ast = false; +bool str_fin_wait_und = false; + +#define ZERO_STR_AST temp_str_ast = 0; str_cl_wait_ast = false; str_chr_wait_ast = false; str_fin_wait_ast = false; +#define ZERO_STR_UND temp_str_und = 0; str_cl_wait_und = false; str_chr_wait_und = false; str_fin_wait_und = false; + + +void end_strength(struct tree_element* root, struct tree_element** active_element, char marker, int tmp_str, int type) { + struct tree_element* parent_strength = find_parent_type(root, *active_element, type); + if(!parent_strength) + return; + if(tmp_str < *(unsigned int*)parent_strength->value) { + struct tree_element* tmp_active = parent_strength->parent; + for(int i = tmp_str; i < *(unsigned int*)parent_strength->value; i++) + append_char_to_active(root, &tmp_active, marker); + *(unsigned int*)parent_strength->value = tmp_str; + } else if(tmp_str > *(unsigned int*)parent_strength->value) { + struct tree_element* tmp_active; + if((parent_strength->parent)->children_n >= 2) + tmp_active = &(*(parent_strength->parent)->children[(parent_strength->parent)->children_n-2]); + else { + NEW_ACTIVE_CHILD(tmp_active, parent_strength->parent, 0, t_inner, false); + } + //printf("\naa:%d\n", tmp_str); + for(int i = *(unsigned int*)parent_strength->value; i < tmp_str; i++) + append_char_to_active(root, &tmp_active, marker); } +} + +// Call this function if a strength character is waiting to hit a different character +void str_wait_hit(struct tree_element* root, struct tree_element** active_element, bool* str_cl_wait, bool* str_fin_wait, unsigned int* temp_str, bool* str_chr_wait, char str_chr, int type) { + if(*str_cl_wait) { + if(*str_fin_wait) { + end_strength(root, active_element, str_chr, *temp_str, type); + *str_fin_wait = false; + *temp_str = 0; + } + *str_cl_wait = false; + } else + *str_cl_wait = true; - memcpy(link_text, start + 1, link_text_len); - memcpy(link_loc, closing_sqr_bracket + 2, link_loc_len); - *out_text = link_text; - *out_loc = link_loc; - *out_len = link_text_len + link_loc_len; + *str_chr_wait = false; +} - return closing_rnd_bracket; +void str_chr_hit(struct tree_element* root, struct tree_element** active_element, bool* str_cl_wait, bool* str_fin_wait, unsigned int* temp_str, bool* str_chr_wait, int desired_type) { + struct tree_element* parent_strength = find_parent_type(root, *active_element, desired_type); + if(parent_strength) { + // We are already in a strength element + // Check if we are waiting to close + if(*str_cl_wait) { + (*(unsigned int*)(parent_strength->value))++; + if((*(unsigned int*)(parent_strength->value)) >= *temp_str) { + *active_element = parent_strength->parent; + ZERO_STR_AST + return; + } else { + *str_fin_wait = true; + } + } else { + (*temp_str)++; + } + } else { + // Enter a new strength element as we are currently not in one + if((*active_element)->type == t_inner) + *active_element = (*active_element)->parent; + if(*active_element == root) { + NEW_ACTIVE_CHILD((*active_element), root, -1, t_p, true); + } + NEW_ACTIVE_CHILD((*active_element), (*active_element), -1, desired_type, true); + (*active_element)->value = calloc(sizeof(unsigned int), 1); + *temp_str = 1; + } + *str_chr_wait = true; } -// @todo NEXT // The program needs to loop through the loop again to cose all the open things at the end, THIS NEEDS TO BE IMPLEMENTED for xhtml int parse_markdown(char* input, char* buffer, size_t buffer_size) { - bool empty_line_carry = true; - bool spaced_codeblock = false; - bool in_paragraph = false; - // This is the offset in parsed_text - size_t offset = 0; - // This describes the spaces the last list level had - int list_spaces = 0; - int list_level = 0; - bool in_list = false; - char* next_line = input; - // Different indicators for all the inline things - // How strongly are we currently emphasized? * = 1; ** = 2; *** = 3 - int strength_level = 0; - bool in_mono = false; - bool in_cut = false; - bool in_quotes = false; - // Signal we are on the last line, for safety reasons we just insert an empty paragraph - bool on_lastline = false; - for(char* line = input; next_line != NULL; line = next_line) { - // Compute the next line and put a '\0' at the end of the current line - { - char* line_end = strchr(line, '\n'); - if(line_end != NULL) { - *line_end = '\0'; - next_line = line_end + 1; + utf8_length = 0; + + clock_t before = clock(); + memset(buffer, 0, buffer_size); + bool escaped = false; + bool newline = false; + bool list_waiting = false; + bool ol_list = false; + int ol_start = 0; + // This will be set to some non-null value when there is a code element to return to + struct tree_element* code_element = NULL; + + int hash_chain = 0; + int dash_chain = 0; + int eq_chain = 0; + int spaces_trimmed = 0; + int root_on_newline = false; + int soft_newline_count = 0; + + bool inline_code_wait = false; + bool fenced_code = false; + char fenced_char = '\0'; + + // @todo: These two variables should be globals! Why did I even define them here at all? + struct tree_element* root = new_element(); + root->allow_inner = false; + struct tree_element* active_element = root; + for(register char* cur_char = input; *cur_char != '\0'; cur_char++) { + printf("%c", *cur_char); + if(escaped || (code_element && *cur_char != '\n' && *cur_char != '\r' && !newline)) { + escaped = false; + if((*cur_char == '\n' || *cur_char == '\r')) { + if(active_element->parent) + active_element = active_element->parent; + struct tree_element* br_child = NEW_ACTIVE_CHILD(br_child, active_element, -1, t_br, false); } else { - empty_line_carry = true; - on_lastline = true; + append_char_to_active(root, &active_element, *cur_char); } + continue; } - size_t line_length = 0; - char line_buffer[LINE_MAX] = {'\0'}; - bool no_special = true; - // The pre_line_buffer contains html headers that are needed before content of the current line - char pre_line_buffer[LINE_MAX] = "\0"; - char* trimmed_line = NULL; - // When this is true we can format text with things like <em>, <b>, etc. - bool format_allow = true; - int spaces_skipped = 0; - bool empty_line = false; - // Check if we have an empty line - if(empty_line_carry) { - empty_line_carry = false; - empty_line = true; - } - - spaces_skipped = trim_space(line, &trimmed_line); - if(!on_lastline) { - if(*trimmed_line == '\0' || *trimmed_line == '\r') { - empty_line_carry = true; - continue; - } + // man is this disgusting + if(fenced_code) { + if((*cur_char == '\n' || *cur_char == '\r') && + *(cur_char + 1) == fenced_char && *(cur_char + 2) == fenced_char && *(cur_char + 3) == fenced_char) { + cur_char += 3; + active_element = root; + fenced_code = false; + } else + append_char_to_active(root, &active_element, *cur_char); + continue; } - if(on_lastline) { - empty_line = true; - line = "\0"; + if(str_chr_wait_ast && *cur_char != '*') { + str_wait_hit(root, &active_element, &str_cl_wait_ast, &str_fin_wait_ast, &temp_str_ast, &str_chr_wait_ast, '*', t_str_ast); + } + + if(str_chr_wait_und && *cur_char != '_') { + str_wait_hit(root, &active_element, &str_cl_wait_und, &str_fin_wait_und, &temp_str_und, &str_chr_wait_und, '_', t_str_und); } - // Blockquotes - /* - I love me some spaghetti bolognese ;P - Luckily there is enough spaghetti here - for everyone - - ----| - ----|------------ - \//\----| - ||/\||\ - |/|||||\ - --------------- - \ / - \-----------/ - */ - if(*trimmed_line == '>') { - // We are in a blockquote! - // Set the new line to after the quote marker - line = trimmed_line + 1; - // trim again - spaces_skipped = trim_space(line, &trimmed_line); - if(!in_quotes) { - if(in_paragraph) { - strncat(pre_line_buffer, "</p><blockquote>", LINE_MAX -1); - in_paragraph = false; - } else { - line_length = prepend(line_buffer, "<blockquote><p>"); - in_paragraph = true; - } - in_quotes = true; - } - } else if(in_quotes) { - if(in_paragraph) { - strncat(pre_line_buffer, "</p></blockquote>", LINE_MAX-1); - in_paragraph = false; - } else { - strncat(pre_line_buffer, "</blockquote>", LINE_MAX-1); - } - in_quotes = false; + #define LAST_TO_TITLE(n) {\ + if(root->children_n >= 1) {\ + struct tree_element* last_element = root->children[root->children_n-1];\ + last_element->type = t_h;\ + if(last_element->value)\ + free(last_element->value);\ + last_element->value = malloc(sizeof(int));\ + *(int*)last_element->value = n;\ + }\ } - // / Blockquotes - - // Unordered Lists ------------------------------------------------ - if(spaces_skipped < 4 + list_spaces && *trimmed_line == '*' && *(trimmed_line + 1) == ' ') { - line_length = prepend(line_buffer, "<li>"); - if(!in_list) { - line_length = prepend(line_buffer, "<ul>"); - in_list = true; - } else { - strncat(pre_line_buffer, "</li>\n", LINE_MAX-1); - } - if(spaces_skipped < list_spaces) { - for(int i = (list_spaces - spaces_skipped) >> 1; i > 0; i--) { - strncat(pre_line_buffer, "</ul>\n", LINE_MAX-1); - list_level--; - } - } else if(spaces_skipped > list_spaces) { - for(int i = (spaces_skipped - list_spaces) >> 1; i > 0; i--) { - line_length = prepend(line_buffer, "<ul>"); - list_level++; - } - } - trimmed_line = trimmed_line + 2; - list_spaces = spaces_skipped; - } else if(in_list && empty_line) { - strncat(pre_line_buffer, "</li></ul>\n", LINE_MAX-1); - for(int i = 0; i < list_level; i++) { - strncat(pre_line_buffer, "</ul>\n", LINE_MAX-1); - } - in_list = false; - list_spaces = 0; + #define APPEND_SPACES for(int i = 0; i < soft_newline_count; i++) append_char_to_active(root, &active_element, ' '); + + // Checks that should be done if a non special character is hit, might also be necessary to check sometimes not in default + #define DEFAULT_CHECKS {\ + APPEND_SPACES \ + soft_newline_count = 0; \ + if(spaces_trimmed >= 4 && (newline || active_element == root)) { \ + /* if code_element is set the new active element is just returned to the code element*/ \ + if(code_element) { \ + active_element = code_element;\ + /* append_char_to_active(root, &active_element, '\n'); */\ + } else {\ + NEW_ACTIVE_CHILD(active_element, root, -1, t_code, true);\ + code_element = active_element;\ + }\ + for(int i = 0; i < spaces_trimmed - 4; i++)\ + append_char_to_active(root, &active_element, ' ');\ + } else if(code_element) {\ + active_element = root;\ + code_element = false;\ + }\ + if(hash_chain > 0) {\ + for(int i = 0; i < hash_chain; i++)\ + append_char_to_active(root, &active_element, '#');\ + hash_chain = 0;\ + }\ + if(dash_chain > 0) {\ + for(int i = 0; i < dash_chain; i++)\ + append_char_to_active(root, &active_element, '-');\ + dash_chain = 0;\ + }\ + if(eq_chain > 0) {\ + for(int i = 0; i < eq_chain; i++)\ + append_char_to_active(root, &active_element, '=');\ + eq_chain = 0;\ + }\ + \ + if(newline) {\ + if(root_on_newline) {\ + active_element = root;\ + root_on_newline = false;\ + } else if(!code_element)\ + /* Check if we have a soft linebreak (two spaces before newline) */\ + append_char_to_active(root, &active_element, ' ');\ + }\ } - // / Unordered List ----------------------------------------------- - // @todo Ordered list - - // Spaced codeblocks ---------------------------------------------- - // Also check if we have a list - if(spaces_skipped >= 4 && !in_list) { - // If this is true we are already in a codeblock - if(spaced_codeblock) { - line_length = prepend(line_buffer, "\n"); - no_special = false; - format_allow = false; - } else { - line_length = prepend(line_buffer, "<pre><code>"); - line_length = append(line_buffer, "<p>"); - spaced_codeblock = true; - no_special = false; - format_allow = false; - } // Replace any leading tabs with 4 spaces - char* code_replaced = line; - for(int i = 0; i < 4;) { - if(*(line + i) == '\t') { - code_replaced ++; - i += 4; - } else if(*(line + i) == ' ') { - code_replaced++; - i++; - } else { + switch(*cur_char) { + // Character escaping + case('\\'): + escaped = true; + break; + // Tabs + case('\t'): + if(active_element == root || newline) { + //@todo with this we should relatively easily be able to check for code blocks! + spaces_trimmed += 4; break; + } else goto default2; + + break; + // Newline + case('\r'): + // Ignoring \r goes against the commonmark spec, but who cares + break; + case('\n'): + list_waiting = false; + if(dash_chain >= 1) { + active_element = root; + // Make the last element a title unless + if(!newline) { + if(dash_chain >= 3 && active_element == root) { + struct tree_element* hr = NEW_ACTIVE_CHILD(hr, root, -1, t_hr, false); + } else { + for(int i = 0; i < dash_chain; i++) + append_char_to_active(root, NULL, '-'); + } + } else { + LAST_TO_TITLE(2); + } + dash_chain = 0; } - } - line_length = append(line_buffer, code_replaced); - } else if(spaced_codeblock) { - spaced_codeblock = false; - strncat(pre_line_buffer, "</p></code></pre>\n", LINE_MAX-1); - } - // / Spaced codeblocks -------------------------------------------- - - - // Titles (#) ----------------------------------------------------- - // @todo allow enclosing titles bracket style - if(no_special && *trimmed_line == '#') { - int header_depth = 1; - while(header_depth < 6) { - if(*(trimmed_line + header_depth) == '#') - header_depth++; - else - break; - } - char* trimmed_title = NULL; - if(trim_space(trimmed_line + header_depth, - &trimmed_title) > 0) { - no_special = false; - - char title[LINE_MAX + 1] = ""; - if(snprintf(title, LINE_MAX + 1, "<h%d>%s</h%d>\n", - header_depth, - trimmed_title, - header_depth - ) > LINE_MAX) - { - jb_log(LL_WARN, false, "Title too long"); - return -1; + if(eq_chain >= 1) { + active_element = root; + if(newline) { + LAST_TO_TITLE(1); + } else { + for(int i = 0; i < eq_chain; i++) + append_char_to_active(root, NULL, '='); + } + eq_chain = 0; } - line_length = append(line_buffer, title); - } - // / Titles ------------------------------------------------------- - // Thematic brakes ------------------------------------------------ - } else if(no_special && (*trimmed_line == '-' || *trimmed_line == '*' || *trimmed_line == '_')) { - int i = 1; - for(; *(trimmed_line+i) == *(trimmed_line+i-1); i++) { - } - if(*(trimmed_line+i) == '\0') { - no_special = false; - line_length = append(line_buffer, "<hr/>\n"); - } - } - // / Thematic brakes ---------------------------------------------- - - // Plain Text ----------------------------------------------------- - if(no_special) { - line_length = append(line_buffer, trimmed_line); - // Handle newline via slash/* - if(*(line_buffer + line_length -1) == '\\') { - *(line_buffer + line_length -1) = '\n'; - if(*(line_buffer + line_length -2) != '\\') { - empty_line_carry = true; + if(hash_chain > 0) + active_element = root; + if(newline) { + // A double new line means we return the active element to root + active_element = root; + newline = false; + root_on_newline = false; + code_element = NULL; + active_element = root; + } else if(active_element != root) { + if(soft_newline_count < 2) + newline = true; + else { + struct tree_element* old_active = active_element; + NEXT_ALLOW_INNER(active_element, root) + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_br, false); + active_element = old_active; + NEXT_ALLOW_INNER(active_element, root) + } } - } - // Handle newlines - if(empty_line && in_paragraph) { - strncat(pre_line_buffer, "</p>\n", LINE_MAX-1); - in_paragraph = false; - } - // If we are not in a paragraph enter one - if(!in_paragraph && !in_list) { - line_length = prepend(line_buffer, "<p>"); - in_paragraph = true; - } else { // If we are in a paragraph the newline will be converted to a space - line_length = prepend(line_buffer, " "); - } - } else if(in_paragraph) { - line_length = prepend(line_buffer, "</p>\n"); - in_paragraph = false; - } + + //printf("%d", temp_str_ast); + if(temp_str_ast > 0) + end_strength(root, &active_element, '*', temp_str_ast, t_str_ast); + ZERO_STR_AST + //printf("%d\n", temp_str_und); + if(temp_str_und > 0) + end_strength(root, &active_element, '_', temp_str_und, t_str_und); + ZERO_STR_UND - if(format_allow) { - bool escaped = false; - char format_line_buffer[LINE_MAX] = {0}; - for(char* line_position = line_buffer; *line_position != '\0'; line_position++) { - // Handle a previously escaped character - if(escaped) { - char escaped_char[2] = {*line_position, '\0'}; - // Also check if we have an escaped newline - strncat(format_line_buffer, *line_position == '\n' ? "\\" : escaped_char, LINE_MAX-1); - escaped = false; - continue; - } - switch(*line_position) { - // Text strength, for sake of simplicity, we'll treat * and _ the same - case '*' : - case '_' : - { - // Look ahead if the next ones are also strength indicators - int strength_indicators = 0; - for(; strength_level > 0 ? strength_indicators<strength_level : true ; strength_indicators++) { - if(!(*(line_position + strength_indicators) == '*' || - *(line_position + strength_indicators) == '_')) { - break; - } - } + if(code_element) + append_char_to_active(root, &active_element, '\n'); - if(strength_level == 0) { - if(strength_indicators == 1) { - strncat(format_line_buffer, "<em>", LINE_MAX-1); - } else if(strength_indicators == 2) { - strncat(format_line_buffer, "<b>", LINE_MAX-1); - } else { - strncat(format_line_buffer, "<b><em>", LINE_MAX-1); - } - strength_level = strength_indicators; - line_position += strength_indicators -1; - } else { - if(strength_level >= 3 && strength_indicators >= 3) { - strncat(format_line_buffer, "</em></b>", LINE_MAX-1); - } else if(strength_level == 2 && strength_indicators >= 2) { - strncat(format_line_buffer, "</b>", LINE_MAX-1); - } else { - strncat(format_line_buffer, "</em>", LINE_MAX-1); - } - strength_level = strength_level - strength_indicators; - line_position += strength_indicators -1; - } - } + inline_code_wait = false; + hash_chain = 0; + spaces_trimmed = 0; + soft_newline_count = 0; + break; + // Numbered lists + case('1'): + case('2'): + case('3'): + case('4'): + case('5'): + case('6'): + case('7'): + case('8'): + case('9'): + case('0'): + if((active_element == root || newline) && *(cur_char+1) == '.' && *(cur_char+2) == ' ') { + list_waiting = true; + ol_list = true; + ol_start = 0; + sscanf(cur_char, "%d.", &ol_start); + cur_char++; + } else + goto default2; + break; + case('*'): + if((active_element == root || newline) && *(cur_char+1) == ' ' ) { + list_waiting = true; + } else { + str_chr_hit(root, &active_element, &str_cl_wait_ast, &str_fin_wait_ast, &temp_str_ast, &str_chr_wait_ast, t_str_ast); + } + break; + case('_'): + str_chr_hit(root, &active_element, &str_cl_wait_und, &str_fin_wait_und, &temp_str_und, &str_chr_wait_und, t_str_und); + break; + // No 3 backticks are supported, use 4 spaces at the begining of a line to get a <pre><code> block + + case('`'): + case('~'): + // Check for fenced code + if(active_element == root || newline) { + if(*cur_char == *(cur_char + 1) && *(cur_char + 2)) { + NEW_ACTIVE_CHILD(active_element, root, -1, t_code, true); + fenced_code = true; + fenced_char = *cur_char; + // Ignore the rest of the line + while(*cur_char != 0 && *cur_char != '\n') + cur_char++; break; - // Inline code - case '`': - { - if(in_mono) { - strncat(format_line_buffer, "</code>", LINE_MAX - 1); - in_mono = false; - } else { - strncat(format_line_buffer, "<code>", LINE_MAX - 1); - in_mono = true; - } - } + } + } + if(*cur_char == '~') + goto default2; + DEFAULT_CHECKS; + if(inline_code_wait) { + struct tree_element* parent_code = find_parent_type(root, active_element, t_inline_code); + if(!parent_code) + goto default2; + *(bool*)(parent_code->value) = true; + active_element = parent_code->parent; + inline_code_wait= false; + } + else { + NEXT_ALLOW_INNER(active_element, root); + if(active_element == root) { + NEW_ACTIVE_CHILD(active_element, root, -1, t_p, true); + } + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_inline_code, true); + active_element->value = malloc(sizeof(bool)); + *(bool*)(active_element->value) = false; + inline_code_wait = true; + } + break; + case('['): + char* link_text = NULL; + char* link_loc = NULL; + size_t link_len = 0; + char* new_position = get_link_components(cur_char, &link_text, &link_loc, &link_len); + if(new_position) { + APPEND_SPACES + if(active_element == root) { + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_p, true); + } + NEXT_ALLOW_INNER(active_element, root) + // We have a link + // Create a new a element which contains the link adress + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_a, true); + active_element->value = link_loc; + // Create an inner element in it which contains the text + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_inner, false); + active_element->value = link_text; + // Return to the old parent + active_element = active_element->parent->parent; + cur_char = new_position; + break; + } else + goto default2; + case('!'): + char* alt_text = NULL; + char* img_loc = NULL; + size_t img_len = 0; + new_position = get_link_components(cur_char + 1, &alt_text, &img_loc, &img_len); + if(new_position) { + NEXT_ALLOW_INNER(active_element, root) + // This contains the image link + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_img, true); + active_element->value = img_loc; + // This contains the image alt text + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_inner, false); + active_element->value = alt_text; + active_element = active_element->parent->parent; + cur_char = new_position; + break; + } else + goto default2; + break; + // Titles + case('='): + if(active_element == root || newline) { + // Make the last line a title, otherwise we fall through + eq_chain++; + break; + } + case('-'): + if(active_element == root || newline) { + list_waiting = true; + dash_chain++; + } + break; + case('#'): + if(hash_chain == 0 || newline) { + if(active_element == root || newline || active_element->type == t_li) { + if(!(active_element->type == t_li)) + active_element = root; + hash_chain = 1; + newline = false; break; - // Cut text - case '~': - { - // Look ahead, we only want to cut if there are two tildes - if(*(line_position + 1) != '~') { - strncat(format_line_buffer, "~", LINE_MAX - 1); - break; - } - if(in_cut) { - // Close - strncat(format_line_buffer, "</s>", LINE_MAX - 1); - in_cut = false; - } else { - in_cut = true; - strncat(format_line_buffer, "<s>", LINE_MAX - 1); - } - line_position++; - } + } + } else { + hash_chain++; + break; + } + case('>'): + if(newline || active_element == root) { + // Look if we already have a block quote parent somewhere, if so just continue, otherwise create a new one. + if(find_parent_type(root, active_element, t_bq)) break; - // brackets - case '[': - { - char* link_text = NULL; - char* link_loc = NULL; - size_t link_loctxt_len = 0; - char* closing_rnd_bracket = get_link_components(line_position, &link_text, &link_loc, &link_loctxt_len); - // This is 16 characters long (add one for good measure (: ): - //<a href=""></a>\0 - char* link_html = calloc(link_loctxt_len + 17, 1); - if(link_html == NULL || closing_rnd_bracket == NULL) { - strncat(format_line_buffer, "[", LINE_MAX - 1); - if(link_html != NULL) free(link_html); - if(link_text != NULL) free(link_text); - if(link_loc != NULL) free(link_loc); - break; - } - - sprintf(link_html, "<a href=\"%s\">%s</a>", link_loc, link_text); + NEW_ACTIVE_CHILD(active_element, root, -1, t_bq, true); + break; + } else + goto default2; + case('<'): + char* closing_gt = cur_char; + while(*closing_gt != 0 && *closing_gt != '\n') { + if(*closing_gt == '>') break; + closing_gt++; + } + if(*closing_gt != '>') + goto default2; - strncat(format_line_buffer, link_html, LINE_MAX - 1); - line_position = closing_rnd_bracket; - free(link_html); - free(link_text); - free(link_loc); + NEW_ACTIVE_CHILD(active_element, active_element->allow_inner ? active_element : root, -1, t_inner, false); + active_element->value = malloc(closing_gt - cur_char + 1); + memcpy(active_element->value, cur_char, closing_gt - cur_char + 1); + *(char*)((active_element->value) + (unsigned int)(closing_gt - cur_char) + 1) = 0; + cur_char = closing_gt; + active_element = active_element->parent; + case(' '): + if(hash_chain > 0) { + if(active_element->type != t_h) { + if(active_element->parent ? active_element->parent->type == t_h : false) { + active_element = active_element->parent; + } else { + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_h, true); + active_element->value = malloc(sizeof(int)); } - break; - // Images - case '!': - { - char* img_alt = NULL; - char* img_src = NULL; - size_t img_altsrc_len = 0; - if(*(line_position + 1) == '\0') { - strncat(format_line_buffer, "!", LINE_MAX - 1); + } + *(int*)(active_element->value) = hash_chain; + hash_chain = 0; + root_on_newline = true; + break; + } + // Here lists are created + if(list_waiting && dash_chain <= 1) { + dash_chain = 0; + newline = false; + /* Look if we have an ancestor somewhere that has spaces fewer or equal to the spaces we skipped. + * On fewer spaces we enter a new list below the one we found + * Is it equal we just add a new list item + * If we do not find a list we create a new one at the root node */ + struct tree_element* look_element = find_parent_type(root, active_element, t_list);; + bool found_list = false; + while(look_element != NULL){ + // Out list has more indents and is therefore a child to the one we found + // Enter new child list + if(((struct list_data*)look_element->value)->indent < spaces_trimmed) { + active_element = new_list(look_element, spaces_trimmed, ol_list, ol_start); + found_list = true; + break; + } + // We found a list of the exact indentation level + else if(((struct list_data*)look_element->value)->indent == spaces_trimmed) { + // If the list type is the same simply mark the found element as active, otherwise create a new sibling list + if(((struct list_data*)look_element->value)->ordered == ol_list) { + active_element = look_element; + found_list = true; + } else { + active_element = new_list(look_element->parent, spaces_trimmed, ol_list, ol_start); + found_list = true; break; } - char* closing_rnd_bracket = get_link_components(line_position + 1, &img_alt, &img_src, &img_altsrc_len); - // This is 21 characters long, again we add one more to be safe - //<img alt="" src=""/>\0 - char* img_html = calloc(img_altsrc_len + 22, 1); - if(img_html == NULL || closing_rnd_bracket == NULL) { - strncat(format_line_buffer, "!", LINE_MAX - 1); - if(img_html != NULL) free(img_html); - if(img_alt != NULL) free(img_alt); - if(img_src != NULL) free(img_src); - break; - } - sprintf(img_html, "<img alt=\"%s\" src=\"%s\"/>", img_alt, img_src); - - strncat(format_line_buffer, img_html, LINE_MAX - 1); - line_position = closing_rnd_bracket; - free(img_html); - free(img_alt); - free(img_src); - } - break; - // escape - case '\\': - { - escaped = true; - } - break; - default: - { - // See how much text we have - size_t text_found_n = strcspn(line_position, "*_`[!\\~"); - char text_found[LINE_MAX] = {'\0'}; - memcpy(text_found, line_position, text_found_n); - text_found[text_found_n+1] = '\0'; - append(format_line_buffer, text_found); - line_position += text_found_n-1; - } + break; + } + // The current list has fewer indents than what we found, look farther for a parent + else if(((struct list_data*)look_element->value)->indent > spaces_trimmed) + look_element = find_parent_type(root, look_element->parent, t_list); + else + look_element = find_parent_type(root, look_element, t_list); + } + // Enter a new list + if(!found_list) { + active_element = new_list(root, spaces_trimmed, ol_list, ol_start); + } + NEW_ACTIVE_CHILD(active_element, active_element, -1, t_li, true); + root_on_newline = true; + list_waiting = false; + ol_list = false; } - } - memcpy(line_buffer, format_line_buffer, LINE_MAX-1); - line_length = strnlen(format_line_buffer, LINE_MAX); + // Trim spaces from newline + if(active_element == root || newline) { + //@todo with this we should relatively easily be able to check for code blocks! + spaces_trimmed++; + break; + } + soft_newline_count++; + break; + // Default character handling + default2: + default: + DEFAULT_CHECKS; + newline = false; + list_waiting = false; + ol_list = false; + append_char_to_active(root, &active_element, *cur_char); + break; } + } + // Convert the tree to valid html + depth++; + char* html = tree_to_html(root); + //printf("%s\n", html ? html : "" ); + // Tear down the tree + free_tree(root); + if(html) { + strncpy(buffer, html, buffer_size - 1); + } + free(html); + printf("Time to process in ns: %ld\n", (clock() - before) / (CLOCKS_PER_SEC / 1000000)); + depth--; + return 0; +} - // Prepend the pre_line_buffer to the line_buffer - line_length = prepend(line_buffer, pre_line_buffer); - - if(offset + line_length > buffer_size) { - jb_log(LL_WARN, false, "too long"); - return -1; - } +// Returns location of the closing round bracket if found, otherwise it returns a NULL +// out_text and out_loc are allocated by this function +// Don't forget to free +char* get_link_components(char* start, char** out_text, char** out_loc, size_t* out_len) { + // look how far the next newline is away + size_t line_length = strcspn(start, "\r\n"); - memcpy(buffer + offset, line_buffer, line_length); - offset += line_length; - if(on_lastline) { - *(buffer+offset) = '\0'; - if(in_paragraph) { - strncat(buffer, "</p>", buffer_size); - } - break; - } + // Search for the next closing bracket + char* closing_sqr_bracket = memchr(start, ']', line_length); + if(closing_sqr_bracket == NULL) { + return NULL; } - return 0; + if(*(closing_sqr_bracket + 1) != '(') { + return NULL; + } + char* closing_rnd_bracket = memchr(closing_sqr_bracket + 1, ')', line_length - (closing_sqr_bracket - start)); + if(closing_rnd_bracket == NULL) { + return NULL; + } + size_t link_text_len = closing_sqr_bracket - start - 1; + size_t link_loc_len = closing_rnd_bracket - closing_sqr_bracket - 2; + + char* link_text = calloc(link_text_len + 1, 1); + if(link_text == NULL) { + jb_log(LL_ERR, true, "calloc error"); + return NULL; + } + char* link_loc = calloc(link_loc_len + 1, 1); + if(link_text == NULL) { + jb_log(LL_ERR, true, "calloc error"); + free(link_text); + return NULL; + } + + + memcpy(link_text, start + 1, link_text_len); + memcpy(link_loc, closing_sqr_bracket + 2, link_loc_len); + *out_text = link_text; + *out_loc = link_loc; + *out_len = link_text_len + link_loc_len; + + return closing_rnd_bracket; } @@ -17,5 +17,6 @@ #ifndef MARKDOWN #define MARKDOWN +#include <stddef.h> int parse_markdown(char* input, char* buffer, size_t buffer_size); #endif |