1 module plain.plain;
2 
3 
4 import std.algorithm;
5 import std.array;
6 import std.ascii;
7 import std.conv;
8 import std.format;
9 import std.range;
10 import std.regex;
11 import std.string;
12 import std.uni;
13 import std.utf;
14 
15 
16 import html.dom;
17 
18 
// Compile-time regex used by textFormat to split text into words. It matches
// a single whitespace character (`\s`, with newline, tab and carriage return
// also listed explicitly), so runs of whitespace yield empty words that the
// caller has to skip.
private __gshared auto wordSplitter = ctRegex!(`\s|\n|\t|\r`);
20 
21 
/**
 * Case-insensitive hash of a tag name.
 *
 * Starts from 5381 and, for every code unit, multiplies the running value by
 * 33 and XORs in the ASCII-lowercased unit. The function is also evaluated at
 * compile time to build the `case` labels of the tag switch in `traverse`.
 *
 * Params:
 *   x = tag name to hash
 *
 * Returns: the hash value; names differing only in ASCII case hash equally.
 */
private size_t tagHashOf(const(char)[] x) {
	size_t result = 5381;
	foreach(unit; x) {
		const lowered = cast(size_t)(std.ascii.toLower(unit));
		result = (result * 33) ^ lowered;
	}
	return result;
}
28 
29 
/**
 * Tells whether a link target can be used as-is, without the base URL.
 *
 * Params:
 *   href = link target to inspect
 *
 * Returns: `true` when `href` is non-empty and either starts with '/' or
 *          contains "://"; `false` otherwise (including for an empty string).
 */
private bool isAbsoluteHRef(const(char)[] href) {
	if (href.empty)
		return false;

	// rooted path
	if (href.front == '/')
		return true;

	// anything carrying a scheme separator
	return href.indexOf("://") >= 0;
}
33 
34 
/// User-facing settings for the HTML to plain text conversion.
struct Options {
	uint wrap = 78;			// wrap column; 0 disables wrapping
	uint indent;			// global indent: spaces put in front of every new line
	dchar listMarker = '*'; // decorator printed in front of unnumbered list items
	string baseHRef;		// prefix for link/image targets that are not absolute
	string[] skipElements;	// CSS selectors; matching elements are skipped together with their children
	bool solidLinks = true;	// keep links in a single line, even when longer than the wrap column
}
43 
44 
/// Mutable formatting state threaded through `traverse` and `textFormat`.
/// List handling works on copies of it, so changes made for a list item do
/// not automatically reach the enclosing state.
private struct TraverseState {
	uint heading;				// inside <h?></h?>; text is upper-cased while non-zero
	uint pre;					// inside <pre></pre>; text is emitted verbatim while non-zero
	uint indent;				// current indent for new lines
	uint wrap;					// current wrapping column (0 = no wrapping)
	uint line;					// current line length
	uint index;					// list item index; 0 means items get the list marker instead of a number
	uint indexWidth;			// list item index width (digit count of the item total)
	bool skipOneIndent;			// skip indent on the first line

	Selector[] skipElements;	// parsed form of Options.skipElements

	Options options;			// options the conversion was started with
}
59 
60 
/**
 * Appends `text` to `app` word by word, wrapping at `state.wrap` columns.
 *
 * The text is split on whitespace and empty words are dropped, so any run of
 * whitespace collapses into a single space. Every line started here is
 * prefixed with `state.indent` spaces; the indent is not counted against the
 * wrap column.
 *
 * Params:
 *   app   = output sink
 *   text  = text to format
 *   state = traversal state; `state.line` is read as the current line length
 *           and updated on return, `state.skipOneIndent` is cleared once it
 *           has suppressed an indent
 *   link  = the text is a link; together with `state.options.solidLinks` an
 *           over-long word is then kept whole on a line of its own instead of
 *           being split at the wrap column
 */
private auto textFormat(Appender)(ref Appender app, HTMLString text, ref TraverseState state, bool link = false) {
	size_t words = 0;
	size_t length = state.line;

	foreach(word; text.splitter(wordSplitter)) {
		if (word.empty)
			continue;

		// the first word on a fresh line is responsible for the indent
		if ((words == 0) && !state.line) {
			if (!state.skipOneIndent) {
				app.put(' '.repeat(state.indent));
			} else {
				state.skipOneIndent = false;
			}
		}
		++words;

		// width in code points, not code units
		size_t wordLength = word.byDchar.walkLength;

		// a separating space is needed unless the line is empty or already full
		auto extra = ((length != 0) && (length != state.wrap)) ? 1 : 0;
		if (!state.wrap || (length + extra + wordLength <= state.wrap)) {
			if (extra) {
				app.put(' ');
				++length;
			}

			app.put(word);
			length += wordLength;
		} else {
			while (wordLength) {
				// Break only when something is already on the line. An empty
				// line has just been indented (or deliberately left without
				// indent), so breaking again would leave a blank line behind.
				if (length != 0) {
					app.put('\n');
					app.put(' '.repeat(state.indent));
				}

				if ((link && state.options.solidLinks) || (wordLength <= state.wrap)) {
					app.put(word);
					length = wordLength;
					wordLength = 0;
				} else {
					// hard-split an over-long word at the wrap column
					auto indexSplit = word.toUTFindex(state.wrap);
					app.put(word[0..indexSplit]);
					word = word[indexSplit..$];
					length = state.wrap;
					wordLength -= state.wrap;
				}
			}
		}
	}

	state.line = cast(uint)length;
}
115 
116 
/**
 * Recursively renders `node` and its children as plain text into `app`.
 *
 * Elements matching one of `state.skipElements` are dropped together with
 * their children. Element handling is dispatched on the case-insensitive
 * hash of the tag name; tags without a dedicated case only have their
 * children rendered. Comments, CDATA, declarations and processing
 * instructions produce no output.
 *
 * Params:
 *   app   = output sink
 *   node  = node to render
 *   state = formatting state, updated as output is produced; list items and
 *           ordered lists render their children with a modified copy
 */
private void traverse(Appender)(ref Appender app, Node node, ref TraverseState state) {
	final switch (node.type) with (NodeTypes) {
	case Element:
		foreach(selector; state.skipElements) {
			if (selector.matches(node))
				return;
		}

		auto hash = tagHashOf(node.tag);

		switch (hash) {
		case tagHashOf("a"):
			// render the link text first, then look at what was produced
			auto start = app.data.length;
			foreach(child; node.children)
				traverse(app, child, state);
			auto label = cast(string)app.data[start..$].strip;

			if (label.empty && node.hasAttr("title"))
				label = cast(string)node.attr("title").strip;

			// a target identical to the visible text is not repeated
			auto href = node.hasAttr("href") ? node.attr("href").strip : null;
			if (href == label)
				href = null;

			auto mail = false;
			if ((href.length >= 7) && (href[0..7] == "mailto:")) {
				href = href[7..$];
				mail = true;
				if (href == label)
					href = null;
			} else if (!href.empty && (href.front == '#')) {
				// in-page anchors mean nothing in plain text
				href = null;
			}

			// a mail address is never relative to the base URL
			auto absolute = mail || href.isAbsoluteHRef;

			if (!href.empty) {
				auto space = (!app.data.empty && (app.data.back != '\n')) ? " " : "";
				textFormat(app, format("%s[%s%s]", space, (absolute ? "" : state.options.baseHRef), href), state, true);
			}
			break;

		case tagHashOf("br"):
			app.put('\n');
			state.line = 0;
			break;

		case tagHashOf("img"):
			auto label = node.hasAttr("alt") ? node.attr("alt").strip : "image";
			auto src = node.hasAttr("src") ? node.attr("src").strip : null;
			// embedded images (cid: and data: sources) have no printable target
			auto hasSrc = !src.empty && (((src.length < 4) || (src[0..4] != "cid:")) && ((src.length < 5) || (src[0..5] != "data:")));

			if (hasSrc) {
				auto absolute = src.isAbsoluteHRef;
				textFormat(app, format("[%s %s%s]", label, (absolute ? "" : state.options.baseHRef), src), state, true);
			} else if (!label.empty)  {
				textFormat(app, format("[%s]", label), state);
			}
			break;

		case tagHashOf("h1"):
		case tagHashOf("h2"):
		case tagHashOf("h3"):
		case tagHashOf("h4"):
		case tagHashOf("h5"):
		case tagHashOf("h6"):
			++state.heading;
			foreach(child; node.children)
				traverse(app, child, state);
			--state.heading;
			app.put('\n');
			state.line = 0;
			break;

		case tagHashOf("hr"):
			app.put('\n');
			state.line = 0;
			app.put(' '.repeat(state.indent));
			app.put('-'.repeat(state.wrap));
			app.put('\n');
			state.line = 0;
			break;

		case tagHashOf("li"):
			// an item always starts on a line of its own
			if (state.line) {
				app.put('\n');
				state.line = 0;
			}

			// children are laid out to the right of the marker
			auto liState = state;
			liState.index = 0;
			liState.indexWidth = 0;
			liState.indent = state.indent + state.indexWidth + 3;
			liState.skipOneIndent = true;
			// NOTE(review): this is unsigned arithmetic, so a wrap column
			// smaller than the marker width wraps around to a huge value
			liState.wrap = state.wrap ? state.wrap - (state.indexWidth + 3) : 0;

			// Only the first item after an enclosing marker shares its line;
			// consume the flag so that the following siblings are indented.
			if (state.skipOneIndent) {
				state.skipOneIndent = false;
			} else {
				app.put(' '.repeat(state.indent));
			}

			if (state.index) {
				auto start = app.data.length;
				formattedWrite(app, " %s. ", state.index);
				// digits written, without the surrounding " " and ". "
				auto len = app.data.length - start - 3;
				// pad up to the widest index; guarded so that an index wider
				// than expected cannot underflow the subtraction
				if (len < state.indexWidth)
					app.put(' '.repeat(state.indexWidth - len));
				++state.index;
			} else {
				formattedWrite(app, " %s ", state.options.listMarker);
			}

			foreach(child; node.children)
				traverse(app, child, liState);
			app.put('\n');
			state.line = 0;
			break;

		case tagHashOf("ol"):
			enum itemHash = tagHashOf("li");

			// count the direct <li> children to size the index column
			auto itemCount = 0;
			foreach(child; node.children) {
				if (child.type != NodeTypes.Element)
					continue;

				if (tagHashOf(child.tag) == itemHash)
					++itemCount;
			}

			if (itemCount) {
				auto olState = state;
				olState.index = 1;
				olState.indexWidth = cast(uint)(itemCount.to!string.length);

				foreach(child; node.children)
					traverse(app, child, olState);

				// the items were rendered through a copy; carry the output
				// position back so that following text is laid out correctly
				state.line = olState.line;
				state.skipOneIndent = olState.skipOneIndent;
			}
			break;

		case tagHashOf("p"):
			app.put('\n');
			state.line = 0;
			foreach(child; node.children)
				traverse(app, child, state);
			app.put('\n');
			state.line = 0;
			break;

		case tagHashOf("pre"):
			app.put('\n');
			state.line = 0;
			++state.pre;
			foreach(child; node.children)
				traverse(app, child, state);
			app.put('\n');
			--state.pre;
			state.line = 0;
			break;

		// elements whose content never shows up in the output
		case tagHashOf("head"):
		case tagHashOf("script"):
		case tagHashOf("style"):
		case tagHashOf("title"):
			break;

		default:
			foreach(child; node.children)
				traverse(app, child, state);
			break;
		}
		break;

	case Text:
		if (!state.pre) {
			textFormat(app, state.heading ? node.text.toUpper : node.text, state);
		} else {
			// preformatted text goes out untouched
			app.put(node.text);
		}
		break;

	case Comment:
	case CDATA:
	case Declaration:
	case ProcessingInstruction:
		break;
	}
}
294 
295 
/**
 * Renders the tree rooted at `root` as plain text into `app`.
 *
 * Params:
 *   app     = output sink receiving the text
 *   root    = node to start rendering from
 *   options = formatting options; every entry of `options.skipElements` is
 *             parsed into a selector before rendering starts
 */
void toplain(Appender)(ref Appender app, Node root, Options options = Options()) {
	TraverseState initial;
	initial.indent = options.indent;
	initial.wrap = options.wrap;
	initial.options = options;

	// parse the skip selectors once, up front
	initial.skipElements.reserve(options.skipElements.length);
	foreach(css; options.skipElements)
		initial.skipElements ~= Selector.parse(css);

	traverse(app, root, initial);
}
309 
310 
/**
 * Renders the tree rooted at `root` as plain text.
 *
 * Params:
 *   root    = node to start rendering from
 *   options = formatting options
 *
 * Returns: the rendered text as a newly built string.
 */
string toplain(Node root, Options options = Options()) {
	auto output = appender!string;
	toplain(output, root, options);
	return output.data;
}
316 
317 
/**
 * Renders a parsed document as plain text, starting at its root node.
 *
 * Params:
 *   doc     = document to render
 *   options = formatting options
 *
 * Returns: the rendered text as a newly built string.
 */
string toplain(ref Document doc, Options options = Options()) {
	auto output = appender!string;
	toplain(output, doc.root, options);
	return output.data;
}
323 
324 
/**
 * Parses `html` into a document and renders it as plain text.
 *
 * Params:
 *   html    = HTML source
 *   options = formatting options
 *
 * Returns: the rendered text as a newly built string.
 */
string toplain(string html, Options options = Options()) {
	auto document = createDocument(html);
	return toplain(document, options);
}