inlines.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384
  1. #include <stdlib.h>
  2. #include <string.h>
  3. #include <stdio.h>
  4. #include "cmark_ctype.h"
  5. #include "config.h"
  6. #include "node.h"
  7. #include "parser.h"
  8. #include "references.h"
  9. #include "cmark.h"
  10. #include "houdini.h"
  11. #include "utf8.h"
  12. #include "scanners.h"
  13. #include "inlines.h"
// UTF-8 encoded replacement strings for typographic punctuation. The
// handlers below substitute them for ASCII sequences when `smart` is set.
static const char *EMDASH = "\xE2\x80\x94";           // U+2014 EM DASH
static const char *ENDASH = "\xE2\x80\x93";           // U+2013 EN DASH
static const char *ELLIPSES = "\xE2\x80\xA6";         // U+2026 HORIZONTAL ELLIPSIS
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";  // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";  // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019
// Macros for creating various kinds of simple inlines.
// The literal-carrying variants take the subject, a start column and an end
// column (0-based positions in the subject; make_literal converts them to
// 1-based columns) and the chunk to store. The remaining variants take only
// the allocator.
#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)
// Longest backtick run whose position is recorded in subject.backticks;
// an opening run longer than this is never matched as a code span.
#define MAXBACKTICKS 1000
// One entry of the subject's delimiter stack (a doubly linked list whose top
// is subject.last_delim): a run of emphasis or smart-quote characters that
// may later open or close a span.
typedef struct delimiter {
  struct delimiter *previous; // entry pushed before this one; NULL at the bottom
  struct delimiter *next;     // entry pushed after this one; NULL at the top
  cmark_node *inl_text;       // text node created for the delimiter run
  bufsize_t length;           // literal length of inl_text at push time
  unsigned char delim_char;   // the delimiter character of the run
  bool can_open;              // run may open a span (set by scan_delims)
  bool can_close;             // run may close a span (set by scan_delims)
} delimiter;
// One entry of the subject's bracket stack (singly linked through
// `previous`; the top is subject.last_bracket).
typedef struct bracket {
  struct bracket *previous;             // entry pushed before this one
  struct delimiter *previous_delimiter; // subject.last_delim at push time
  cmark_node *inl_text;                 // text node passed to push_bracket
  bufsize_t position;                   // subject.pos at push time
  bool image;                           // value of the `image` flag passed to push_bracket
  bool active;                          // set to true when pushed
  bool bracket_after;                   // set to true once a later bracket is pushed
} bracket;
// Parser state for one run of inline content.
typedef struct {
  cmark_mem *mem;              // allocator used for nodes and stack entries
  cmark_chunk input;           // the text being parsed
  int line;                    // line number stamped on created nodes
  bufsize_t pos;               // current offset into input
  int block_offset;            // added to positions when computing columns
  int column_offset;           // added to positions when computing columns;
                               // rewritten after a multi-line match
  cmark_reference_map *refmap; // reference map supplied by the caller
  delimiter *last_delim;       // top of the delimiter stack (NULL if empty)
  bracket *last_bracket;       // top of the bracket stack (NULL if empty)
  // backticks[n] is the start position of the most recently scanned run of
  // exactly n backticks (see scan_to_closing_backticks).
  bufsize_t backticks[MAXBACKTICKS + 1];
  // Set once a scan for closing backticks has reached the end of input.
  bool scanned_for_backticks;
} subject;
  61. static CMARK_INLINE bool S_is_line_end_char(char c) {
  62. return (c == '\n' || c == '\r');
  63. }
// Forward declarations for helpers that are referenced before their
// definitions appear in this file.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer);
static int parse_inline(subject *subj, cmark_node *parent, int options);
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
                             cmark_chunk *chunk, cmark_reference_map *refmap);
static bufsize_t subject_find_special_char(subject *subj, int options);
  70. // Create an inline with a literal string value.
  71. static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
  72. int start_column, int end_column,
  73. cmark_chunk s) {
  74. cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
  75. cmark_strbuf_init(subj->mem, &e->content, 0);
  76. e->type = (uint16_t)t;
  77. e->as.literal = s;
  78. e->start_line = e->end_line = subj->line;
  79. // columns are 1 based.
  80. e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
  81. e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
  82. return e;
  83. }
  84. // Create an inline with no value.
  85. static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
  86. cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));
  87. cmark_strbuf_init(mem, &e->content, 0);
  88. e->type = t;
  89. return e;
  90. }
  91. // Like make_str, but parses entities.
  92. static cmark_node *make_str_with_entities(subject *subj,
  93. int start_column, int end_column,
  94. cmark_chunk *content) {
  95. cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
  96. if (houdini_unescape_html(&unescaped, content->data, content->len)) {
  97. return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
  98. } else {
  99. return make_str(subj, start_column, end_column, *content);
  100. }
  101. }
  102. // Duplicate a chunk by creating a copy of the buffer not by reusing the
  103. // buffer like cmark_chunk_dup does.
  104. static cmark_chunk chunk_clone(cmark_mem *mem, cmark_chunk *src) {
  105. cmark_chunk c;
  106. bufsize_t len = src->len;
  107. c.len = len;
  108. c.data = (unsigned char *)mem->calloc(len + 1, 1);
  109. c.alloc = 1;
  110. if (len)
  111. memcpy(c.data, src->data, len);
  112. c.data[len] = '\0';
  113. return c;
  114. }
  115. static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
  116. int is_email) {
  117. cmark_strbuf buf = CMARK_BUF_INIT(mem);
  118. cmark_chunk_trim(url);
  119. if (url->len == 0) {
  120. cmark_chunk result = CMARK_CHUNK_EMPTY;
  121. return result;
  122. }
  123. if (is_email)
  124. cmark_strbuf_puts(&buf, "mailto:");
  125. houdini_unescape_html_f(&buf, url->data, url->len);
  126. return cmark_chunk_buf_detach(&buf);
  127. }
  128. static CMARK_INLINE cmark_node *make_autolink(subject *subj,
  129. int start_column, int end_column,
  130. cmark_chunk url, int is_email) {
  131. cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);
  132. link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
  133. link->as.link.title = cmark_chunk_literal("");
  134. link->start_line = link->end_line = subj->line;
  135. link->start_column = start_column + 1;
  136. link->end_column = end_column + 1;
  137. cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
  138. return link;
  139. }
  140. static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
  141. cmark_chunk *chunk, cmark_reference_map *refmap) {
  142. int i;
  143. e->mem = mem;
  144. e->input = *chunk;
  145. e->line = line_number;
  146. e->pos = 0;
  147. e->block_offset = block_offset;
  148. e->column_offset = 0;
  149. e->refmap = refmap;
  150. e->last_delim = NULL;
  151. e->last_bracket = NULL;
  152. for (i = 0; i <= MAXBACKTICKS; i++) {
  153. e->backticks[i] = 0;
  154. }
  155. e->scanned_for_backticks = false;
  156. }
  157. static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
  158. static CMARK_INLINE unsigned char peek_char(subject *subj) {
  159. // NULL bytes should have been stripped out by now. If they're
  160. // present, it's a programming error:
  161. assert(!(subj->pos < subj->input.len && subj->input.data[subj->pos] == 0));
  162. return (subj->pos < subj->input.len) ? subj->input.data[subj->pos] : 0;
  163. }
  164. static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) {
  165. return subj->input.data[pos];
  166. }
  167. // Return true if there are more characters in the subject.
  168. static CMARK_INLINE int is_eof(subject *subj) {
  169. return (subj->pos >= subj->input.len);
  170. }
  171. // Advance the subject. Doesn't check for eof.
  172. #define advance(subj) (subj)->pos += 1
  173. static CMARK_INLINE bool skip_spaces(subject *subj) {
  174. bool skipped = false;
  175. while (peek_char(subj) == ' ' || peek_char(subj) == '\t') {
  176. advance(subj);
  177. skipped = true;
  178. }
  179. return skipped;
  180. }
  181. static CMARK_INLINE bool skip_line_end(subject *subj) {
  182. bool seen_line_end_char = false;
  183. if (peek_char(subj) == '\r') {
  184. advance(subj);
  185. seen_line_end_char = true;
  186. }
  187. if (peek_char(subj) == '\n') {
  188. advance(subj);
  189. seen_line_end_char = true;
  190. }
  191. return seen_line_end_char || is_eof(subj);
  192. }
  193. // Take characters while a predicate holds, and return a string.
  194. static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) {
  195. unsigned char c;
  196. bufsize_t startpos = subj->pos;
  197. bufsize_t len = 0;
  198. while ((c = peek_char(subj)) && (*f)(c)) {
  199. advance(subj);
  200. len++;
  201. }
  202. return cmark_chunk_dup(&subj->input, startpos, len);
  203. }
  204. // Return the number of newlines in a given span of text in a subject. If
  205. // the number is greater than zero, also return the number of characters
  206. // between the last newline and the end of the span in `since_newline`.
  207. static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) {
  208. int nls = 0;
  209. int since_nl = 0;
  210. while (len--) {
  211. if (subj->input.data[from++] == '\n') {
  212. ++nls;
  213. since_nl = 0;
  214. } else {
  215. ++since_nl;
  216. }
  217. }
  218. if (!nls)
  219. return 0;
  220. *since_newline = since_nl;
  221. return nls;
  222. }
  223. // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and
  224. // `column_offset` according to the number of newlines in a just-matched span
  225. // of text in `subj`.
  226. static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) {
  227. if (!(options & CMARK_OPT_SOURCEPOS)) {
  228. return;
  229. }
  230. int since_newline;
  231. int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline);
  232. if (newlines) {
  233. subj->line += newlines;
  234. node->end_line += newlines;
  235. node->end_column = since_newline;
  236. subj->column_offset = -subj->pos + since_newline + extra;
  237. }
  238. }
  239. // Try to process a backtick code span that began with a
  240. // span of ticks of length openticklength length (already
  241. // parsed). Return 0 if you don't find matching closing
  242. // backticks, otherwise return the position in the subject
  243. // after the closing backticks.
  244. static bufsize_t scan_to_closing_backticks(subject *subj,
  245. bufsize_t openticklength) {
  246. bool found = false;
  247. if (openticklength > MAXBACKTICKS) {
  248. // we limit backtick string length because of the array subj->backticks:
  249. return 0;
  250. }
  251. if (subj->scanned_for_backticks &&
  252. subj->backticks[openticklength] <= subj->pos) {
  253. // return if we already know there's no closer
  254. return 0;
  255. }
  256. while (!found) {
  257. // read non backticks
  258. unsigned char c;
  259. while ((c = peek_char(subj)) && c != '`') {
  260. advance(subj);
  261. }
  262. if (is_eof(subj)) {
  263. break;
  264. }
  265. bufsize_t numticks = 0;
  266. while (peek_char(subj) == '`') {
  267. advance(subj);
  268. numticks++;
  269. }
  270. // store position of ender
  271. if (numticks <= MAXBACKTICKS) {
  272. subj->backticks[numticks] = subj->pos - numticks;
  273. }
  274. if (numticks == openticklength) {
  275. return (subj->pos);
  276. }
  277. }
  278. // got through whole input without finding closer
  279. subj->scanned_for_backticks = true;
  280. return 0;
  281. }
  282. // Destructively modify string, converting newlines to
  283. // spaces, then removing a single leading + trailing space,
  284. // unless the code span consists entirely of space characters.
  285. static void S_normalize_code(cmark_strbuf *s) {
  286. bufsize_t r, w;
  287. bool contains_nonspace = false;
  288. for (r = 0, w = 0; r < s->size; ++r) {
  289. switch (s->ptr[r]) {
  290. case '\r':
  291. if (s->ptr[r + 1] != '\n') {
  292. s->ptr[w++] = ' ';
  293. }
  294. break;
  295. case '\n':
  296. s->ptr[w++] = ' ';
  297. break;
  298. default:
  299. s->ptr[w++] = s->ptr[r];
  300. }
  301. if (s->ptr[r] != ' ') {
  302. contains_nonspace = true;
  303. }
  304. }
  305. // begins and ends with space?
  306. if (contains_nonspace &&
  307. s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
  308. cmark_strbuf_drop(s, 1);
  309. cmark_strbuf_truncate(s, w - 2);
  310. } else {
  311. cmark_strbuf_truncate(s, w);
  312. }
  313. }
  314. // Parse backtick code section or raw backticks, return an inline.
  315. // Assumes that the subject has a backtick at the current position.
  316. static cmark_node *handle_backticks(subject *subj, int options) {
  317. cmark_chunk openticks = take_while(subj, isbacktick);
  318. bufsize_t startpos = subj->pos;
  319. bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
  320. if (endpos == 0) { // not found
  321. subj->pos = startpos; // rewind
  322. return make_str(subj, subj->pos, subj->pos, openticks);
  323. } else {
  324. cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
  325. cmark_strbuf_set(&buf, subj->input.data + startpos,
  326. endpos - startpos - openticks.len);
  327. S_normalize_code(&buf);
  328. cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
  329. adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
  330. return node;
  331. }
  332. }
// Scan a run of the delimiter character `c` at the current position (for
// example ***, ** or *) and return the number of characters scanned, or 0.
// For the quote characters ' and " exactly one character is consumed.
// Advances position past the run.
//
// `*can_open` and `*can_close` report whether the run may open or close a
// span. They are derived from the code points immediately before and after
// the run via the left-/right-flanking tests below, with extra restrictions
// for '_' and for quotes.
static int scan_delims(subject *subj, unsigned char c, bool *can_open,
                       bool *can_close) {
  int numdelims = 0;
  bufsize_t before_char_pos;
  int32_t after_char = 0;
  int32_t before_char = 0;
  int len;
  bool left_flanking, right_flanking;

  if (subj->pos == 0) {
    // Start of input: use code point 10 (line feed) as the preceding char.
    before_char = 10;
  } else {
    before_char_pos = subj->pos - 1;
    // walk back to the beginning of the UTF_8 sequence:
    // (bytes whose top two bits are 10 are continuation bytes)
    while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) {
      before_char_pos -= 1;
    }
    len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
                                 subj->pos - before_char_pos, &before_char);
    if (len == -1) {
      // Undecodable sequence: fall back to line feed.
      before_char = 10;
    }
  }

  if (c == '\'' || c == '"') {
    numdelims++;
    advance(subj); // limit to 1 delim for quotes
  } else {
    while (peek_char(subj) == c) {
      numdelims++;
      advance(subj);
    }
  }

  // Decode the code point that follows the run; on failure (which
  // presumably includes end of input -- TODO confirm) fall back to line feed.
  len = cmark_utf8proc_iterate(subj->input.data + subj->pos,
                               subj->input.len - subj->pos, &after_char);
  if (len == -1) {
    after_char = 10;
  }

  // Left-flanking: not followed by whitespace, and either not followed by
  // punctuation or preceded by whitespace or punctuation.
  left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
                  (!cmark_utf8proc_is_punctuation(after_char) ||
                   cmark_utf8proc_is_space(before_char) ||
                   cmark_utf8proc_is_punctuation(before_char));
  // Right-flanking: the mirror image of the test above.
  right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
                   (!cmark_utf8proc_is_punctuation(before_char) ||
                    cmark_utf8proc_is_space(after_char) ||
                    cmark_utf8proc_is_punctuation(after_char));

  if (c == '_') {
    // A run of '_' that flanks on both sides may open only when preceded
    // by punctuation and close only when followed by punctuation.
    *can_open = left_flanking &&
                (!right_flanking || cmark_utf8proc_is_punctuation(before_char));
    *can_close = right_flanking &&
                 (!left_flanking || cmark_utf8proc_is_punctuation(after_char));
  } else if (c == '\'' || c == '"') {
    // A quote opens only when strictly left-flanking and not directly
    // after ']' or ')'.
    *can_open = left_flanking && !right_flanking &&
                before_char != ']' && before_char != ')';
    *can_close = right_flanking;
  } else {
    *can_open = left_flanking;
    *can_close = right_flanking;
  }
  return numdelims;
}
  394. /*
  395. static void print_delimiters(subject *subj)
  396. {
  397. delimiter *delim;
  398. delim = subj->last_delim;
  399. while (delim != NULL) {
  400. printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n",
  401. (void*)delim, delim->delim_char,
  402. delim->can_open, delim->can_close,
  403. (void*)delim->next, (void*)delim->previous);
  404. delim = delim->previous;
  405. }
  406. }
  407. */
  408. static void remove_delimiter(subject *subj, delimiter *delim) {
  409. if (delim == NULL)
  410. return;
  411. if (delim->next == NULL) {
  412. // end of list:
  413. assert(delim == subj->last_delim);
  414. subj->last_delim = delim->previous;
  415. } else {
  416. delim->next->previous = delim->previous;
  417. }
  418. if (delim->previous != NULL) {
  419. delim->previous->next = delim->next;
  420. }
  421. subj->mem->free(delim);
  422. }
  423. static void pop_bracket(subject *subj) {
  424. bracket *b;
  425. if (subj->last_bracket == NULL)
  426. return;
  427. b = subj->last_bracket;
  428. subj->last_bracket = subj->last_bracket->previous;
  429. subj->mem->free(b);
  430. }
  431. static void push_delimiter(subject *subj, unsigned char c, bool can_open,
  432. bool can_close, cmark_node *inl_text) {
  433. delimiter *delim = (delimiter *)subj->mem->calloc(1, sizeof(delimiter));
  434. delim->delim_char = c;
  435. delim->can_open = can_open;
  436. delim->can_close = can_close;
  437. delim->inl_text = inl_text;
  438. delim->length = inl_text->as.literal.len;
  439. delim->previous = subj->last_delim;
  440. delim->next = NULL;
  441. if (delim->previous != NULL) {
  442. delim->previous->next = delim;
  443. }
  444. subj->last_delim = delim;
  445. }
  446. static void push_bracket(subject *subj, bool image, cmark_node *inl_text) {
  447. bracket *b = (bracket *)subj->mem->calloc(1, sizeof(bracket));
  448. if (subj->last_bracket != NULL) {
  449. subj->last_bracket->bracket_after = true;
  450. }
  451. b->image = image;
  452. b->active = true;
  453. b->inl_text = inl_text;
  454. b->previous = subj->last_bracket;
  455. b->previous_delimiter = subj->last_delim;
  456. b->position = subj->pos;
  457. b->bracket_after = false;
  458. subj->last_bracket = b;
  459. }
  460. // Assumes the subject has a c at the current position.
  461. static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
  462. bufsize_t numdelims;
  463. cmark_node *inl_text;
  464. bool can_open, can_close;
  465. cmark_chunk contents;
  466. numdelims = scan_delims(subj, c, &can_open, &can_close);
  467. if (c == '\'' && smart) {
  468. contents = cmark_chunk_literal(RIGHTSINGLEQUOTE);
  469. } else if (c == '"' && smart) {
  470. contents =
  471. cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);
  472. } else {
  473. contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
  474. }
  475. inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);
  476. if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
  477. push_delimiter(subj, c, can_open, can_close, inl_text);
  478. }
  479. return inl_text;
  480. }
  481. // Assumes we have a hyphen at the current position.
  482. static cmark_node *handle_hyphen(subject *subj, bool smart) {
  483. int startpos = subj->pos;
  484. advance(subj);
  485. if (!smart || peek_char(subj) != '-') {
  486. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-"));
  487. }
  488. while (smart && peek_char(subj) == '-') {
  489. advance(subj);
  490. }
  491. int numhyphens = subj->pos - startpos;
  492. int en_count = 0;
  493. int em_count = 0;
  494. int i;
  495. cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
  496. if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes
  497. em_count = numhyphens / 3;
  498. } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes
  499. en_count = numhyphens / 2;
  500. } else if (numhyphens % 3 == 2) { // use one en dash at end
  501. en_count = 1;
  502. em_count = (numhyphens - 2) / 3;
  503. } else { // use two en dashes at the end
  504. en_count = 2;
  505. em_count = (numhyphens - 4) / 3;
  506. }
  507. for (i = em_count; i > 0; i--) {
  508. cmark_strbuf_puts(&buf, EMDASH);
  509. }
  510. for (i = en_count; i > 0; i--) {
  511. cmark_strbuf_puts(&buf, ENDASH);
  512. }
  513. return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
  514. }
  515. // Assumes we have a period at the current position.
  516. static cmark_node *handle_period(subject *subj, bool smart) {
  517. advance(subj);
  518. if (smart && peek_char(subj) == '.') {
  519. advance(subj);
  520. if (peek_char(subj) == '.') {
  521. advance(subj);
  522. return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));
  523. } else {
  524. return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".."));
  525. }
  526. } else {
  527. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("."));
  528. }
  529. }
// Resolve the delimiters that sit above `stack_bottom` on the delimiter
// stack (all of them when `stack_bottom` is NULL).
//
// Walking forward from the lowest such delimiter, each potential closer is
// paired with the nearest suitable opener below it:
//   - '*' and '_' pairs are turned into emphasis/strong nodes by
//     S_insert_emph;
//   - quote closers have their text replaced by a right quotation mark and,
//     when an opener is found, the opener's text by a left quotation mark.
// On return every delimiter above `stack_bottom` has been removed from the
// stack.
static void process_emphasis(subject *subj, delimiter *stack_bottom) {
  delimiter *closer = subj->last_delim;
  delimiter *opener;
  delimiter *old_closer;
  bool opener_found;
  int openers_bottom_index = 0;
  // Lower bounds for the opener search, one per closer category:
  // [0] '"', [1] '\'', [2] '_', [3..5] '*' by closer length modulo 3.
  // A failed search raises the bound so later closers of the same category
  // do not rescan the same delimiters.
  delimiter *openers_bottom[6] = {stack_bottom, stack_bottom, stack_bottom,
                                  stack_bottom, stack_bottom, stack_bottom};

  // move back to first relevant delim.
  while (closer != NULL && closer->previous != stack_bottom) {
    closer = closer->previous;
  }

  // now move forward, looking for closers, and handling each
  while (closer != NULL) {
    if (closer->can_close) {
      switch (closer->delim_char) {
      case '"':
        openers_bottom_index = 0;
        break;
      case '\'':
        openers_bottom_index = 1;
        break;
      case '_':
        openers_bottom_index = 2;
        break;
      case '*':
        openers_bottom_index = 3 + (closer->length % 3);
        break;
      default:
        assert(false);
      }

      // Now look backwards for first matching opener:
      opener = closer->previous;
      opener_found = false;
      while (opener != NULL && opener != openers_bottom[openers_bottom_index]) {
        if (opener->can_open && opener->delim_char == closer->delim_char) {
          // interior closer of size 2 can't match opener of size 1
          // or of size 1 can't match 2
          // (when either side can both open and close, the pair is rejected
          // if the summed lengths are a multiple of 3 unless the closer's
          // own length is a multiple of 3)
          if (!(closer->can_open || opener->can_close) ||
              closer->length % 3 == 0 ||
              (opener->length + closer->length) % 3 != 0) {
            opener_found = true;
            break;
          }
        }
        opener = opener->previous;
      }
      // Remember the closer: `closer` is advanced below, but the
      // bookkeeping after it still refers to the delimiter just handled.
      old_closer = closer;

      if (closer->delim_char == '*' || closer->delim_char == '_') {
        if (opener_found) {
          // S_insert_emph returns the delimiter to continue from.
          closer = S_insert_emph(subj, opener, closer);
        } else {
          closer = closer->next;
        }
      } else if (closer->delim_char == '\'') {
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
        }
        closer = closer->next;
      } else if (closer->delim_char == '"') {
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
        }
        closer = closer->next;
      }
      if (!opener_found) {
        // set lower bound for future searches for openers
        openers_bottom[openers_bottom_index] = old_closer->previous;
        if (!old_closer->can_open) {
          // we can remove a closer that can't be an
          // opener, once we've seen there's no
          // matching opener:
          remove_delimiter(subj, old_closer);
        }
      }
    } else {
      closer = closer->next;
    }
  }

  // free all delimiters in list until stack_bottom:
  while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
    remove_delimiter(subj, subj->last_delim);
  }
}
// Turn the inlines between `opener` and `closer` into an emphasis or strong
// node.
//
// Two delimiter characters are used from each side (strong) when both runs
// still have at least two characters, otherwise one (emphasis). The used
// characters are cut from the two text nodes, every delimiter between the
// pair is removed from the stack, and the nodes between the two text nodes
// become children of a new node inserted right after the opener's text.
// A run that is used up completely has its text node freed and its
// delimiter removed.
//
// Returns the delimiter from which the caller should continue: `closer`
// itself if it still has characters left, otherwise the entry that
// followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer) {
  delimiter *delim, *tmp_delim;
  bufsize_t use_delims;
  cmark_node *opener_inl = opener->inl_text;
  cmark_node *closer_inl = closer->inl_text;
  bufsize_t opener_num_chars = opener_inl->as.literal.len;
  bufsize_t closer_num_chars = closer_inl->as.literal.len;
  cmark_node *tmp, *tmpnext, *emph;

  // calculate the actual number of characters used from this closer
  use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;

  // remove used characters from associated inlines.
  // (only the literal lengths are shortened; the data is not moved)
  opener_num_chars -= use_delims;
  closer_num_chars -= use_delims;
  opener_inl->as.literal.len = opener_num_chars;
  closer_inl->as.literal.len = closer_num_chars;

  // free delimiters between opener and closer
  delim = closer->previous;
  while (delim != NULL && delim != opener) {
    // fetch the predecessor first: remove_delimiter frees `delim`
    tmp_delim = delim->previous;
    remove_delimiter(subj, delim);
    delim = tmp_delim;
  }

  // create new emph or strong, and splice it in to our inlines
  // between the opener and closer
  emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);

  tmp = opener_inl->next;
  while (tmp && tmp != closer_inl) {
    // fetch the successor first: appending re-parents `tmp`
    tmpnext = tmp->next;
    cmark_node_append_child(emph, tmp);
    tmp = tmpnext;
  }
  cmark_node_insert_after(opener_inl, emph);

  // the new node takes its source range from the two delimiter text nodes
  emph->start_line = opener_inl->start_line;
  emph->end_line = closer_inl->end_line;
  emph->start_column = opener_inl->start_column;
  emph->end_column = closer_inl->end_column;

  // if opener has 0 characters, remove it and its associated inline
  if (opener_num_chars == 0) {
    cmark_node_free(opener_inl);
    remove_delimiter(subj, opener);
  }

  // if closer has 0 characters, remove it and its associated inline
  if (closer_num_chars == 0) {
    // remove empty closer inline
    cmark_node_free(closer_inl);
    // remove closer from list
    tmp_delim = closer->next;
    remove_delimiter(subj, closer);
    closer = tmp_delim;
  }

  return closer;
}
  673. // Parse backslash-escape or just a backslash, returning an inline.
  674. static cmark_node *handle_backslash(subject *subj) {
  675. advance(subj);
  676. unsigned char nextchar = peek_char(subj);
  677. if (cmark_ispunct(
  678. nextchar)) { // only ascii symbols and newline can be escaped
  679. advance(subj);
  680. return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
  681. } else if (!is_eof(subj) && skip_line_end(subj)) {
  682. return make_linebreak(subj->mem);
  683. } else {
  684. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
  685. }
  686. }
  687. // Parse an entity or a regular "&" string.
  688. // Assumes the subject has an '&' character at the current position.
  689. static cmark_node *handle_entity(subject *subj) {
  690. cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);
  691. bufsize_t len;
  692. advance(subj);
  693. len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,
  694. subj->input.len - subj->pos);
  695. if (len == 0)
  696. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
  697. subj->pos += len;
  698. return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
  699. }
  700. // Clean a URL: remove surrounding whitespace, and remove \ that escape
  701. // punctuation.
  702. cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
  703. cmark_strbuf buf = CMARK_BUF_INIT(mem);
  704. cmark_chunk_trim(url);
  705. if (url->len == 0) {
  706. cmark_chunk result = CMARK_CHUNK_EMPTY;
  707. return result;
  708. }
  709. houdini_unescape_html_f(&buf, url->data, url->len);
  710. cmark_strbuf_unescape(&buf);
  711. return cmark_chunk_buf_detach(&buf);
  712. }
  713. cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title) {
  714. cmark_strbuf buf = CMARK_BUF_INIT(mem);
  715. unsigned char first, last;
  716. if (title->len == 0) {
  717. cmark_chunk result = CMARK_CHUNK_EMPTY;
  718. return result;
  719. }
  720. first = title->data[0];
  721. last = title->data[title->len - 1];
  722. // remove surrounding quotes if any:
  723. if ((first == '\'' && last == '\'') || (first == '(' && last == ')') ||
  724. (first == '"' && last == '"')) {
  725. houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
  726. } else {
  727. houdini_unescape_html_f(&buf, title->data, title->len);
  728. }
  729. cmark_strbuf_unescape(&buf);
  730. return cmark_chunk_buf_detach(&buf);
  731. }
  732. // Parse an autolink or HTML tag.
  733. // Assumes the subject has a '<' character at the current position.
  734. static cmark_node *handle_pointy_brace(subject *subj, int options) {
  735. bufsize_t matchlen = 0;
  736. cmark_chunk contents;
  737. advance(subj); // advance past first <
  738. // first try to match a URL autolink
  739. matchlen = scan_autolink_uri(&subj->input, subj->pos);
  740. if (matchlen > 0) {
  741. contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
  742. subj->pos += matchlen;
  743. return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);
  744. }
  745. // next try to match an email autolink
  746. matchlen = scan_autolink_email(&subj->input, subj->pos);
  747. if (matchlen > 0) {
  748. contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
  749. subj->pos += matchlen;
  750. return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);
  751. }
  752. // finally, try to match an html tag
  753. matchlen = scan_html_tag(&subj->input, subj->pos);
  754. if (matchlen > 0) {
  755. contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
  756. subj->pos += matchlen;
  757. cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
  758. adjust_subj_node_newlines(subj, node, matchlen, 1, options);
  759. return node;
  760. }
  761. // if nothing matches, just return the opening <:
  762. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<"));
  763. }
  764. // Parse a link label. Returns 1 if successful.
  765. // Note: unescaped brackets are not allowed in labels.
  766. // The label begins with `[` and ends with the first `]` character
  767. // encountered. Backticks in labels do not start code spans.
  768. static int link_label(subject *subj, cmark_chunk *raw_label) {
  769. bufsize_t startpos = subj->pos;
  770. int length = 0;
  771. unsigned char c;
  772. // advance past [
  773. if (peek_char(subj) == '[') {
  774. advance(subj);
  775. } else {
  776. return 0;
  777. }
  778. while ((c = peek_char(subj)) && c != '[' && c != ']') {
  779. if (c == '\\') {
  780. advance(subj);
  781. length++;
  782. if (cmark_ispunct(peek_char(subj))) {
  783. advance(subj);
  784. length++;
  785. }
  786. } else {
  787. advance(subj);
  788. length++;
  789. }
  790. if (length > MAX_LINK_LABEL_LENGTH) {
  791. goto noMatch;
  792. }
  793. }
  794. if (c == ']') { // match found
  795. *raw_label =
  796. cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
  797. cmark_chunk_trim(raw_label);
  798. advance(subj); // advance past ]
  799. return 1;
  800. }
  801. noMatch:
  802. subj->pos = startpos; // rewind
  803. return 0;
  804. }
  805. static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset,
  806. cmark_chunk *output) {
  807. bufsize_t i = offset;
  808. size_t nb_p = 0;
  809. while (i < input->len) {
  810. if (input->data[i] == '\\' &&
  811. i + 1 < input-> len &&
  812. cmark_ispunct(input->data[i+1]))
  813. i += 2;
  814. else if (input->data[i] == '(') {
  815. ++nb_p;
  816. ++i;
  817. if (nb_p > 32)
  818. return -1;
  819. } else if (input->data[i] == ')') {
  820. if (nb_p == 0)
  821. break;
  822. --nb_p;
  823. ++i;
  824. } else if (cmark_isspace(input->data[i])) {
  825. if (i == offset) {
  826. return -1;
  827. }
  828. break;
  829. } else {
  830. ++i;
  831. }
  832. }
  833. if (i >= input->len)
  834. return -1;
  835. {
  836. cmark_chunk result = {input->data + offset, i - offset, 0};
  837. *output = result;
  838. }
  839. return i - offset;
  840. }
  841. static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset,
  842. cmark_chunk *output) {
  843. bufsize_t i = offset;
  844. if (i < input->len && input->data[i] == '<') {
  845. ++i;
  846. while (i < input->len) {
  847. if (input->data[i] == '>') {
  848. ++i;
  849. break;
  850. } else if (input->data[i] == '\\')
  851. i += 2;
  852. else if (input->data[i] == '\n' || input->data[i] == '<')
  853. return -1;
  854. else
  855. ++i;
  856. }
  857. } else {
  858. return manual_scan_link_url_2(input, offset, output);
  859. }
  860. if (i >= input->len)
  861. return -1;
  862. {
  863. cmark_chunk result = {input->data + offset + 1, i - 2 - offset, 0};
  864. *output = result;
  865. }
  866. return i - offset;
  867. }
  868. // Return a link, an image, or a literal close bracket.
  869. static cmark_node *handle_close_bracket(subject *subj) {
  870. bufsize_t initial_pos, after_link_text_pos;
  871. bufsize_t endurl, starttitle, endtitle, endall;
  872. bufsize_t sps, n;
  873. cmark_reference *ref = NULL;
  874. cmark_chunk url_chunk, title_chunk;
  875. cmark_chunk url, title;
  876. bracket *opener;
  877. cmark_node *inl;
  878. cmark_chunk raw_label;
  879. int found_label;
  880. cmark_node *tmp, *tmpnext;
  881. bool is_image;
  882. advance(subj); // advance past ]
  883. initial_pos = subj->pos;
  884. // get last [ or ![
  885. opener = subj->last_bracket;
  886. if (opener == NULL) {
  887. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  888. }
  889. if (!opener->active) {
  890. // take delimiter off stack
  891. pop_bracket(subj);
  892. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  893. }
  894. // If we got here, we matched a potential link/image text.
  895. // Now we check to see if it's a link/image.
  896. is_image = opener->image;
  897. after_link_text_pos = subj->pos;
  898. // First, look for an inline link.
  899. if (peek_char(subj) == '(' &&
  900. ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
  901. ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
  902. &url_chunk)) > -1)) {
  903. // try to parse an explicit link:
  904. endurl = subj->pos + 1 + sps + n;
  905. starttitle = endurl + scan_spacechars(&subj->input, endurl);
  906. // ensure there are spaces btw url and title
  907. endtitle = (starttitle == endurl)
  908. ? starttitle
  909. : starttitle + scan_link_title(&subj->input, starttitle);
  910. endall = endtitle + scan_spacechars(&subj->input, endtitle);
  911. if (peek_at(subj, endall) == ')') {
  912. subj->pos = endall + 1;
  913. title_chunk =
  914. cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
  915. url = cmark_clean_url(subj->mem, &url_chunk);
  916. title = cmark_clean_title(subj->mem, &title_chunk);
  917. cmark_chunk_free(subj->mem, &url_chunk);
  918. cmark_chunk_free(subj->mem, &title_chunk);
  919. goto match;
  920. } else {
  921. // it could still be a shortcut reference link
  922. subj->pos = after_link_text_pos;
  923. }
  924. }
  925. // Next, look for a following [link label] that matches in refmap.
  926. // skip spaces
  927. raw_label = cmark_chunk_literal("");
  928. found_label = link_label(subj, &raw_label);
  929. if (!found_label) {
  930. // If we have a shortcut reference link, back up
  931. // to before the spacse we skipped.
  932. subj->pos = initial_pos;
  933. }
  934. if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
  935. cmark_chunk_free(subj->mem, &raw_label);
  936. raw_label = cmark_chunk_dup(&subj->input, opener->position,
  937. initial_pos - opener->position - 1);
  938. found_label = true;
  939. }
  940. if (found_label) {
  941. ref = cmark_reference_lookup(subj->refmap, &raw_label);
  942. cmark_chunk_free(subj->mem, &raw_label);
  943. }
  944. if (ref != NULL) { // found
  945. url = chunk_clone(subj->mem, &ref->url);
  946. title = chunk_clone(subj->mem, &ref->title);
  947. goto match;
  948. } else {
  949. goto noMatch;
  950. }
  951. noMatch:
  952. // If we fall through to here, it means we didn't match a link:
  953. pop_bracket(subj); // remove this opener from delimiter list
  954. subj->pos = initial_pos;
  955. return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  956. match:
  957. inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
  958. inl->as.link.url = url;
  959. inl->as.link.title = title;
  960. inl->start_line = inl->end_line = subj->line;
  961. inl->start_column = opener->inl_text->start_column;
  962. inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
  963. cmark_node_insert_before(opener->inl_text, inl);
  964. // Add link text:
  965. tmp = opener->inl_text->next;
  966. while (tmp) {
  967. tmpnext = tmp->next;
  968. cmark_node_append_child(inl, tmp);
  969. tmp = tmpnext;
  970. }
  971. // Free the bracket [:
  972. cmark_node_free(opener->inl_text);
  973. process_emphasis(subj, opener->previous_delimiter);
  974. pop_bracket(subj);
  975. // Now, if we have a link, we also want to deactivate earlier link
  976. // delimiters. (This code can be removed if we decide to allow links
  977. // inside links.)
  978. if (!is_image) {
  979. opener = subj->last_bracket;
  980. while (opener != NULL) {
  981. if (!opener->image) {
  982. if (!opener->active) {
  983. break;
  984. } else {
  985. opener->active = false;
  986. }
  987. }
  988. opener = opener->previous;
  989. }
  990. }
  991. return NULL;
  992. }
  993. // Parse a hard or soft linebreak, returning an inline.
  994. // Assumes the subject has a cr or newline at the current position.
  995. static cmark_node *handle_newline(subject *subj) {
  996. bufsize_t nlpos = subj->pos;
  997. // skip over cr, crlf, or lf:
  998. if (peek_at(subj, subj->pos) == '\r') {
  999. advance(subj);
  1000. }
  1001. if (peek_at(subj, subj->pos) == '\n') {
  1002. advance(subj);
  1003. }
  1004. ++subj->line;
  1005. subj->column_offset = -subj->pos;
  1006. // skip spaces at beginning of line
  1007. skip_spaces(subj);
  1008. if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&
  1009. peek_at(subj, nlpos - 2) == ' ') {
  1010. return make_linebreak(subj->mem);
  1011. } else {
  1012. return make_softbreak(subj->mem);
  1013. }
  1014. }
  1015. static bufsize_t subject_find_special_char(subject *subj, int options) {
  1016. // "\r\n\\`&_*[]<!"
  1017. static const int8_t SPECIAL_CHARS[256] = {
  1018. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1019. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
  1020. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1021. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
  1022. 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1023. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1024. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1025. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1026. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1027. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1028. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  1029. // " ' . -
  1030. static const char SMART_PUNCT_CHARS[] = {
  1031. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1032. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
  1033. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1034. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1035. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1036. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1037. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1038. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1039. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1040. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1041. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1042. };
  1043. bufsize_t n = subj->pos + 1;
  1044. while (n < subj->input.len) {
  1045. if (SPECIAL_CHARS[subj->input.data[n]])
  1046. return n;
  1047. if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]])
  1048. return n;
  1049. n++;
  1050. }
  1051. return subj->input.len;
  1052. }
  1053. // Parse an inline, advancing subject, and add it as a child of parent.
  1054. // Return 0 if no inline can be parsed, 1 otherwise.
  1055. static int parse_inline(subject *subj, cmark_node *parent, int options) {
  1056. cmark_node *new_inl = NULL;
  1057. cmark_chunk contents;
  1058. unsigned char c;
  1059. bufsize_t startpos, endpos;
  1060. c = peek_char(subj);
  1061. if (c == 0) {
  1062. return 0;
  1063. }
  1064. switch (c) {
  1065. case '\r':
  1066. case '\n':
  1067. new_inl = handle_newline(subj);
  1068. break;
  1069. case '`':
  1070. new_inl = handle_backticks(subj, options);
  1071. break;
  1072. case '\\':
  1073. new_inl = handle_backslash(subj);
  1074. break;
  1075. case '&':
  1076. new_inl = handle_entity(subj);
  1077. break;
  1078. case '<':
  1079. new_inl = handle_pointy_brace(subj, options);
  1080. break;
  1081. case '*':
  1082. case '_':
  1083. case '\'':
  1084. case '"':
  1085. new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);
  1086. break;
  1087. case '-':
  1088. new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);
  1089. break;
  1090. case '.':
  1091. new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0);
  1092. break;
  1093. case '[':
  1094. advance(subj);
  1095. new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("["));
  1096. push_bracket(subj, false, new_inl);
  1097. break;
  1098. case ']':
  1099. new_inl = handle_close_bracket(subj);
  1100. break;
  1101. case '!':
  1102. advance(subj);
  1103. if (peek_char(subj) == '[') {
  1104. advance(subj);
  1105. new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("!["));
  1106. push_bracket(subj, true, new_inl);
  1107. } else {
  1108. new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!"));
  1109. }
  1110. break;
  1111. default:
  1112. endpos = subject_find_special_char(subj, options);
  1113. contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
  1114. startpos = subj->pos;
  1115. subj->pos = endpos;
  1116. // if we're at a newline, strip trailing spaces.
  1117. if (S_is_line_end_char(peek_char(subj))) {
  1118. cmark_chunk_rtrim(&contents);
  1119. }
  1120. new_inl = make_str(subj, startpos, endpos - 1, contents);
  1121. }
  1122. if (new_inl != NULL) {
  1123. cmark_node_append_child(parent, new_inl);
  1124. }
  1125. return 1;
  1126. }
  1127. // Parse inlines from parent's string_content, adding as children of parent.
  1128. extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
  1129. cmark_reference_map *refmap, int options) {
  1130. subject subj;
  1131. cmark_chunk content = {parent->content.ptr, parent->content.size, 0};
  1132. subject_from_buf(mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap);
  1133. cmark_chunk_rtrim(&subj.input);
  1134. while (!is_eof(&subj) && parse_inline(&subj, parent, options))
  1135. ;
  1136. process_emphasis(&subj, NULL);
  1137. // free bracket and delim stack
  1138. while (subj.last_delim) {
  1139. remove_delimiter(&subj, subj.last_delim);
  1140. }
  1141. while (subj.last_bracket) {
  1142. pop_bracket(&subj);
  1143. }
  1144. }
  1145. // Parse zero or more space characters, including at most one newline.
  1146. static void spnl(subject *subj) {
  1147. skip_spaces(subj);
  1148. if (skip_line_end(subj)) {
  1149. skip_spaces(subj);
  1150. }
  1151. }
  1152. // Parse reference. Assumes string begins with '[' character.
  1153. // Modify refmap if a reference is encountered.
  1154. // Return 0 if no reference found, otherwise position of subject
  1155. // after reference is parsed.
  1156. bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input,
  1157. cmark_reference_map *refmap) {
  1158. subject subj;
  1159. cmark_chunk lab;
  1160. cmark_chunk url;
  1161. cmark_chunk title;
  1162. bufsize_t matchlen = 0;
  1163. bufsize_t beforetitle;
  1164. subject_from_buf(mem, -1, 0, &subj, input, NULL);
  1165. // parse label:
  1166. if (!link_label(&subj, &lab) || lab.len == 0)
  1167. return 0;
  1168. // colon:
  1169. if (peek_char(&subj) == ':') {
  1170. advance(&subj);
  1171. } else {
  1172. return 0;
  1173. }
  1174. // parse link url:
  1175. spnl(&subj);
  1176. if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1) {
  1177. subj.pos += matchlen;
  1178. } else {
  1179. return 0;
  1180. }
  1181. // parse optional link_title
  1182. beforetitle = subj.pos;
  1183. spnl(&subj);
  1184. matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos);
  1185. if (matchlen) {
  1186. title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
  1187. subj.pos += matchlen;
  1188. } else {
  1189. subj.pos = beforetitle;
  1190. title = cmark_chunk_literal("");
  1191. }
  1192. // parse final spaces and newline:
  1193. skip_spaces(&subj);
  1194. if (!skip_line_end(&subj)) {
  1195. if (matchlen) { // try rewinding before title
  1196. subj.pos = beforetitle;
  1197. skip_spaces(&subj);
  1198. if (!skip_line_end(&subj)) {
  1199. return 0;
  1200. }
  1201. } else {
  1202. return 0;
  1203. }
  1204. }
  1205. // insert reference into refmap
  1206. cmark_reference_create(refmap, &lab, &url, &title);
  1207. return subj.pos;
  1208. }