houdini_href_e.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. #include <assert.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include "houdini.h"
  5. /*
  6. * The following characters will not be escaped:
  7. *
  8. * -_.+!*'(),%#@?=;:/,+&$ alphanum
  9. *
  10. * Note that this character set is the addition of:
  11. *
  12. * - The characters which are safe to be in an URL
  13. * - The characters which are *not* safe to be in
  14. * an URL because they are RESERVED characters.
  15. *
  16. * We assume (lazily) that any RESERVED char that
  17. * appears inside an URL is actually meant to
  18. * have its native function (i.e. as an URL
  19. * component/separator) and hence needs no escaping.
  20. *
  21. * There are two exceptions: the characters & (amp)
  22. * and ' (single quote) do not appear in the table.
  23. * They are meant to appear in the URL as components,
  24. * yet they require special HTML-entity escaping
  25. * to generate valid HTML markup.
  26. *
  27. * All other characters will be escaped to %XX.
  28. *
  29. */
  /*
   * Lookup table indexed by byte value (0-255).
   *
   * A non-zero entry marks a byte that houdini_escape_href() copies to the
   * output unchanged.  A zero entry marks a byte that is rewritten, either
   * as an HTML entity ('&' and the single quote) or as a %XX escape (all
   * remaining bytes).
   *
   * The table is laid out sixteen entries per row, so each row corresponds
   * to one value of the high nibble of the byte.
   */
  static const char HREF_SAFE[] = {
      /* 0x00-0x0F: nothing is safe */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x10-0x1F: nothing is safe */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x20-0x2F: space, double quote, '&' and single quote are unsafe */
      0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x30-0x3F: digits and : ; = ? are safe, '<' and '>' are unsafe */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
      /* 0x40-0x4F: '@' and A-O are safe */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x50-0x5F: P-Z and '_' are safe; '[', backslash, ']', '^' are not */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
      /* 0x60-0x6F: backtick is unsafe, a-o are safe */
      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      /* 0x70-0x7F: p-z are safe; '{', '|', '}', '~' and 0x7F are not */
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
      /* 0x80-0xFF: nothing is safe */
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  };
  43. int houdini_escape_href(cmark_strbuf *ob, const uint8_t *src, bufsize_t size) {
  44. static const uint8_t hex_chars[] = "0123456789ABCDEF";
  45. bufsize_t i = 0, org;
  46. uint8_t hex_str[3];
  47. hex_str[0] = '%';
  48. while (i < size) {
  49. org = i;
  50. while (i < size && HREF_SAFE[src[i]] != 0)
  51. i++;
  52. if (likely(i > org))
  53. cmark_strbuf_put(ob, src + org, i - org);
  54. /* escaping */
  55. if (i >= size)
  56. break;
  57. switch (src[i]) {
  58. /* amp appears all the time in URLs, but needs
  59. * HTML-entity escaping to be inside an href */
  60. case '&':
  61. cmark_strbuf_puts(ob, "&amp;");
  62. break;
  63. /* the single quote is a valid URL character
  64. * according to the standard; it needs HTML
  65. * entity escaping too */
  66. case '\'':
  67. cmark_strbuf_puts(ob, "&#x27;");
  68. break;
  69. /* the space can be escaped to %20 or a plus
  70. * sign. we're going with the generic escape
  71. * for now. the plus thing is more commonly seen
  72. * when building GET strings */
  73. #if 0
  74. case ' ':
  75. cmark_strbuf_putc(ob, '+');
  76. break;
  77. #endif
  78. /* every other character goes with a %XX escaping */
  79. default:
  80. hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
  81. hex_str[2] = hex_chars[src[i] & 0xF];
  82. cmark_strbuf_put(ob, hex_str, 3);
  83. }
  84. i++;
  85. }
  86. return 1;
  87. }