module plain.plain;


import std.algorithm;
import std.array;
import std.ascii;
import std.conv;
import std.format;
import std.range;
import std.regex;
import std.string;
import std.uni;
import std.utf;


import html.dom;


// Compile-time regex matching a single whitespace character; text nodes are
// split into words on it (empty pieces between consecutive matches are skipped
// by the caller).
private __gshared auto wordSplitter = ctRegex!(`\s|\n|\t|\r`);


// Case-insensitive (ASCII) hash of a tag name. It is evaluated at compile time
// for the `case` labels in traverse() and at run time for the visited node, so
// the two must stay the same function.
private size_t tagHashOf(const(char)[] x) {
    size_t hash = 5381;
    foreach(i; 0..x.length)
        hash = (hash * 33) ^ cast(size_t)(std.ascii.toLower(x.ptr[i]));
    return hash;
}


// True when `href` is non-empty and either starts with '/' or contains "://",
// i.e. when it must not be prefixed with Options.baseHRef.
private bool isAbsoluteHRef(const(char)[] href) {
    return (!href.empty && ((href.front == '/') || (href.indexOf("://") >= 0)));
}


// User-facing rendering options.
struct Options {
    uint wrap = 78;            // wrap column
    uint indent;               // global indent
    dchar listMarker = '*';    // list item decorator
    string baseHRef;           // base URL for local hrefs
    string[] skipElements;     // CSS selector of elements to skip
    bool solidLinks = true;    // keep links in a single line
}


// Mutable state threaded through the traversal.
private struct TraverseState {
    uint heading;              // inside <h?></h?>
    uint pre;                  // inside <pre></pre>
    uint indent;               // current indent for new lines
    uint wrap;                 // current wrapping column
    uint line;                 // current line length
    uint index;                // list item index
    uint indexWidth;           // list item index width
    bool skipOneIndent;        // skip indent on the first line

    Selector[] skipElements;   // parsed form of Options.skipElements

    Options options;
}


// Appends `text` to `app` word by word, wrapping at `state.wrap` columns
// (0 disables wrapping) and indenting every line it starts by `state.indent`.
//
// Params:
//   app   = output sink
//   text  = text to emit; runs of whitespace collapse to a single space
//   state = traversal state; `line` is read as the current line length and
//           updated on exit, `skipOneIndent` is consumed by the first word
//   link  = true when `text` is a link target; with Options.solidLinks such a
//           word is moved to its own line instead of being split
private void textFormat(Appender)(ref Appender app, HTMLString text, ref TraverseState state, bool link = false) {
    size_t words = 0;
    size_t length = state.line;

    foreach(word; text.splitter(wordSplitter)) {
        if (word.empty)
            continue;

        // The first word on a fresh line is responsible for the indent, unless
        // a list item has already written its marker there.
        if ((words == 0) && !state.line) {
            if (!state.skipOneIndent) {
                app.put(' '.repeat(state.indent));
            } else {
                state.skipOneIndent = false;
            }
        }
        ++words;

        // Width is measured in code points, not code units.
        size_t wordLength = word.byDchar.walkLength;

        // A separating space is needed unless the line is empty or was filled
        // exactly to the wrap column by a split word.
        auto extra = ((length != 0) && (length != state.wrap)) ? 1 : 0;
        if (!state.wrap || (length + extra + wordLength <= state.wrap)) {
            if (extra) {
                app.put(' ');
                ++length;
            }

            app.put(word);
            length += wordLength;
        } else {
            // NOTE(review): a break is emitted even when the current line is
            // still empty, which leaves an indent-only line before an
            // over-long first word — confirm whether that is intended.
            while (wordLength) {
                app.put('\n');
                app.put(' '.repeat(state.indent));

                if ((link && state.options.solidLinks) || (wordLength <= state.wrap)) {
                    app.put(word);
                    length = wordLength;
                    wordLength = 0;
                } else {
                    // Hard-split a word that is wider than a whole line.
                    auto indexSplit = word.toUTFindex(state.wrap);
                    app.put(word[0..indexSplit]);
                    word = word[indexSplit..$];
                    length = state.wrap;
                    wordLength -= state.wrap;
                }
            }
        }
    }

    state.line = cast(uint)length;
}


// Renders `node` and its descendants as plain text into `app`.
//
// Elements matching one of `state.skipElements` are dropped together with
// their subtree. Known tags are dispatched on their case-insensitive hash;
// unknown elements are transparent (only their children are rendered).
private void traverse(Appender)(ref Appender app, Node node, ref TraverseState state) {
    final switch (node.type) with (NodeTypes) {
    case Element:
        foreach(selector; state.skipElements) {
            if (selector.matches(node))
                return;
        }

        auto hash = tagHashOf(node.tag);

        switch (hash) {
        case tagHashOf("a"):
            // Render the link text first, then look at what was written to
            // decide whether the target adds any information.
            auto start = app.data.length;
            foreach(child; node.children)
                traverse(app, child, state);
            auto label = cast(string)app.data[start..$].strip;

            if (label.empty && node.hasAttr("title"))
                label = cast(string)node.attr("title").strip;

            auto href = node.hasAttr("href") ? node.attr("href").strip : null;
            if (href == label)
                href = null;

            bool mail = false;
            if ((href.length >= 7) && (href[0..7] == "mailto:")) {
                href = href[7..$];
                mail = true;
                if (href == label)
                    href = null;
            } else if (!href.empty && (href.front == '#')) {
                // In-page anchors are meaningless in plain text.
                href = null;
            }

            // A mail address is not a path relative to baseHRef, so it must
            // never get the base URL prepended.
            auto absolute = mail || href.isAbsoluteHRef;

            if (!href.empty) {
                auto space = (!app.data.empty && (app.data.back != '\n')) ? " " : "";
                textFormat(app, format("%s[%s%s]", space, (absolute ? "" : state.options.baseHRef), href), state, true);
            }
            break;

        case tagHashOf("br"):
            app.put('\n');
            state.line = 0;
            break;

        case tagHashOf("img"):
            auto label = node.hasAttr("alt") ? node.attr("alt").strip : "image";
            auto src = node.hasAttr("src") ? node.attr("src").strip : null;
            // "cid:" and "data:" sources are not printed.
            auto hasSrc = !src.empty && (((src.length < 4) || (src[0..4] != "cid:")) && ((src.length < 5) || (src[0..5] != "data:")));

            if (hasSrc) {
                auto absolute = src.isAbsoluteHRef;
                textFormat(app, format("[%s %s%s]", label, (absolute ? "" : state.options.baseHRef), src), state, true);
            } else if (!label.empty) {
                textFormat(app, format("[%s]", label), state);
            }
            break;

        case tagHashOf("h1"):
        case tagHashOf("h2"):
        case tagHashOf("h3"):
        case tagHashOf("h4"):
        case tagHashOf("h5"):
        case tagHashOf("h6"):
            // Text nodes below a heading are upper-cased (see the Text case).
            ++state.heading;
            foreach(child; node.children)
                traverse(app, child, state);
            --state.heading;
            app.put('\n');
            state.line = 0;
            break;

        case tagHashOf("hr"):
            app.put('\n');
            state.line = 0;
            app.put(' '.repeat(state.indent));
            app.put('-'.repeat(state.wrap));
            app.put('\n');
            state.line = 0;
            break;

        case tagHashOf("li"):
            // Children are laid out to the right of the marker: deeper indent,
            // narrower wrap, and no indent on the line the marker occupies.
            auto liState = state;
            liState.index = 0;
            liState.indexWidth = 0;
            liState.indent = state.indent + state.indexWidth + 3;
            liState.skipOneIndent = true;
            liState.wrap = state.wrap ? state.wrap - (state.indexWidth + 3) : 0;

            // Consume the pending "skip one indent" request so that only the
            // first item written next to a parent marker loses its indent.
            if (state.skipOneIndent) {
                state.skipOneIndent = false;
            } else {
                app.put(' '.repeat(state.indent));
            }

            if (state.index) {
                auto start = app.data.length;
                formattedWrite(app, " %s. ", state.index);
                auto len = app.data.length - start - 3;
                // Pad the number to the widest index of the list. The guard
                // prevents an unsigned underflow when the index has more
                // digits than the width computed by the enclosing <ol>.
                if (len < state.indexWidth)
                    app.put(' '.repeat(state.indexWidth - len));
                ++state.index;
            } else {
                formattedWrite(app, " %s ", state.options.listMarker);
            }

            foreach(child; node.children)
                traverse(app, child, liState);
            app.put('\n');
            // The item ends its line, so the caller continues at column 0.
            state.line = 0;
            break;

        case tagHashOf("ol"):
            enum itemHash = tagHashOf("li");

            // Count direct <li> children to size the index column.
            auto itemCount = 0;
            foreach(child; node.children) {
                if (child.type != NodeTypes.Element)
                    continue;

                if (tagHashOf(child.tag) == itemHash)
                    ++itemCount;
            }

            if (itemCount) {
                auto olState = state;
                olState.index = 1;
                olState.indexWidth = cast(uint)(itemCount.to!string.length);

                foreach(child; node.children)
                    traverse(app, child, olState);

                // The numbering is local to this list, but the output cursor
                // is shared with the caller: hand it back.
                state.line = olState.line;
                state.skipOneIndent = olState.skipOneIndent;
            }
            break;

        case tagHashOf("p"):
            app.put('\n');
            state.line = 0;
            foreach(child; node.children)
                traverse(app, child, state);
            app.put('\n');
            state.line = 0;
            break;

        case tagHashOf("pre"):
            // Text below <pre> is copied verbatim (see the Text case).
            app.put('\n');
            state.line = 0;
            ++state.pre;
            foreach(child; node.children)
                traverse(app, child, state);
            app.put('\n');
            --state.pre;
            state.line = 0;
            break;

        case tagHashOf("head"):
        case tagHashOf("script"):
        case tagHashOf("style"):
        case tagHashOf("title"):
            // Never rendered.
            break;

        default:
            foreach(child; node.children)
                traverse(app, child, state);
            break;
        }
        break;

    case Text:
        if (!state.pre) {
            textFormat(app, state.heading ? node.text.toUpper : node.text, state);
        } else {
            app.put(node.text);
        }
        break;

    case Comment:
    case CDATA:
    case Declaration:
    case ProcessingInstruction:
        break;
    }
}


// Renders the subtree rooted at `root` as plain text into `app`.
//
// Params:
//   app     = output sink
//   root    = node to start from
//   options = rendering options; every entry of `skipElements` is parsed
//             with Selector.parse before the traversal starts
void toplain(Appender)(ref Appender app, Node root, Options options = Options()) {
    TraverseState state;
    state.options = options;
    state.wrap = options.wrap;
    state.indent = options.indent;

    state.skipElements.reserve(options.skipElements.length);
    foreach(selector; options.skipElements) {
        state.skipElements ~= Selector.parse(selector);
    }

    traverse(app, root, state);
}


// Renders the subtree rooted at `root` and returns the text.
string toplain(Node root, Options options = Options()) {
    auto app = appender!string;
    toplain(app, root, options);
    return app.data;
}


// Renders a whole document, starting at `doc.root`, and returns the text.
string toplain(ref Document doc, Options options = Options()) {
    auto app = appender!string;
    toplain(app, doc.root, options);
    return app.data;
}


// Parses `html` with createDocument and renders the result as plain text.
string toplain(string html, Options options = Options()) {
    auto doc = createDocument(html);
    return toplain(doc, options);
}