diff options
| author | George Abbott <george@gabbott.dev> | 2023-10-04 20:20:57 +0100 | 
|---|---|---|
| committer | George Abbott <george@gabbott.dev> | 2023-10-04 20:20:57 +0100 | 
| commit | 08cec360e16615b1ae157e4926e7317d16765f7e (patch) | |
| tree | 3a03e0a1acf89973de92e7f6f7a08b2a587b7485 | |
| parent | 8e4092347975db7fe90b0895f605bdd79de47410 (diff) | |
Progress thus far
| -rw-r--r-- | dub.selections.json | 6 | ||||
| -rw-r--r-- | source/app.d | 172 | ||||
| -rw-r--r-- | source/html.d | 38 | ||||
| -rw-r--r-- | source/rssmeta.d | 13 | 
4 files changed, 204 insertions, 25 deletions
| diff --git a/dub.selections.json b/dub.selections.json
new file mode 100644
index 0000000..70f9033
--- /dev/null
+++ b/dub.selections.json
@@ -0,0 +1,6 @@
+{
+	"fileVersion": 1,
+	"versions": {
+		"dxml": "0.4.4"
+	}
+}
diff --git a/source/app.d b/source/app.d
index 3c913c3..97630d2 100644
--- a/source/app.d
+++ b/source/app.d
@@ -56,22 +56,30 @@
    */
+import std.stdio;
+import std.algorithm : canFind, filter, findSplitAfter;
+import std.file : dirEntries, DirEntry, readText, SpanMode;
+import core.stdc.stdlib : exit;
+
+import html;
+import rssmeta;
+
 void usage()
 {
 	writeln(
-			"reassess: create an RSS feed out of HTML documents.\n"
-			"Usage: \n"
-			"\treassess DIRECTORY TARGET (FLAGS)\n"
-			"DIRECTORY: The directory where the HTML files are sourced.\n"
-			"TARGET:    The desired target location of the RSS file.\n"
-			"Flags:\n"
-			"\t-h, --help      Display this help message.\n"
-			"\t-m, --metafile  The path of the metafile if not in DIRECTORY.\n"
-			"\t-r, --recursive Whether to recurse DIRECTORY or not.\n"
-			"\t-f, --follow    Whether to follow symlinks in DIRECTORY or not.\n"
-			"\t-o, --order     The order to output the RSS entries.\n"
-			"\t                Options: reverse; forward; alphabetic-by-title.\n"
-			"\t                Default to `reverse` if unspecified.\n"
+			"reassess: create an RSS feed out of HTML documents.\n" ~
+			"Usage: \n" ~
+			"\treassess DIRECTORY TARGET (FLAGS)\n" ~
+			"DIRECTORY: The directory where the HTML files are sourced.\n" ~
+			"TARGET:    The desired target location of the RSS file.\n" ~
+			"Flags:\n" ~
+			"\t-h, --help      Display this help message.\n" ~
+			"\t-m, --metafile  The path of the metafile if not in DIRECTORY.\n" ~
+			"\t-r, --recursive Whether to recurse DIRECTORY or not.\n" ~
+			"\t-f, --follow    Whether to follow symlinks in DIRECTORY or not.\n" ~
+			"\t-o, --order     The order to output the RSS entries.\n" ~
+			"\t                Options: reverse; forward; alphabetic-by-title.\n" ~
+			"\t                Default to `reverse` if unspecified.\n" ~
 			"\t-c, --config    Options to configure functionality. Details TODO\n"
 		   );
 	exit(0);
@@ -79,6 +87,7 @@ void usage()
+// TODO: make ctor so that pubDateAsUnixTimestamp gets populated.
 struct Entry
 {
 	string title;
@@ -91,7 +100,8 @@ private:
 	int pubDateAsUnixTimestamp; // So that we can sort easily.
 }
-Entry make_entry(DirEntry dirEntry)
+
+Entry make_entry(DirEntry dirEntry, RssMeta rssmeta)
 {
 	// From the DirEntry, we can get all file info, and also read the full file
 	// since we know its path. We just need to remember, when we read the
@@ -100,14 +110,115 @@ Entry make_entry(DirEntry dirEntry)
 	// escaped ones.
 	// Or, if <body> does not work, we could try between some comments.
 	// I say, <!-- bct:beg --> and <!-- bct:end --> would meet those criteria.
-}
+	// COMPLETE
+	string get_title(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+	{
+		// Title is either:
+		// 1) rssmeta:<title>
+		// 2) <!-- rss-title: TITLE --> comment in text.
+		// 3) <title>TITLE</title> from text.
+		string title = rssmeta.title_of(dirEntry.name);
+		if (title !is null)
+			return title;
+		title = find_rss_comment("title", text);
+		if (title !is null)
+			return title;
+		title = find_html_title(text);
+		return title; // even if it's null.
+	}
-import std.stdio;
-import std.algorithm : canFind;
-import std.file : dirEntries, DirEntry;
+	// COMPLETE
+	string get_link(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+	{
+		// Link is either:
+		// 1) rssmeta:<link>
+		// 2) <!-- rss-link: link --> comment in text.
+
+		string link = rssmeta.link_of(dirEntry.name);
+		if (link !is null)
+			return link;
+
+		link = find_rss_comment("link", text);
+		return link;
+	}
+
+	// Output is formatted as ISO-8601.
+	string get_pubdate(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+	{
+		// The date must be formatted YYYY-mm-dd HH:MM.
+		// pubDate is either:
+		// 1) rssmeta:<pubDate>
+		// 2) <!-- rss-pubDate: DATE --> comment in text.
+		// 3) the Unix timestamp of the file.
+
+		string pubDate = rssmeta.pubdate_of(dirEntry.name);
+		if (pubDate !is null)
+			return pubDate;
+
+		pubDate = find_rss_comment("pubDate", text);
+		if (pubDate !is null)
+			return pubDate;
+
+		// TODO: add in getting date from the dirEntry.time.
+
+		return pubDate; // even if it's null.
+	}
+
+	// COMPLETE
+	string get_guid(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+	{
+		// guid is either:
+		// 1) rssmeta:<guid>
+		// 2) <!-- rss-guid: guid --> comment in text.
+		// 3) delegated to get_link as a last resort.
+
+		string guid = rssmeta.guid_of(dirEntry.name);
+		if (guid !is null)
+			return guid;
+
+		guid = find_rss_comment("guid", text);
+		if (guid !is null)
+			return guid;
+
+		return get_link(dirEntry, text, rssmeta);
+	}
+
+	void fail(string what)
+	{
+		stderr.writeln("Error occurred when retrieving attribute ", what, " in make_entry");
+		exit(-1);
+	}
+
+
+	string text    = readText(dirEntry.name);
+	if (text is null) // Checked first: find_rss_comment's in-contract requires a non-null haystack.
+		fail("text");
+
+	string title   = get_title(dirEntry, text, rssmeta);
+	string link    = get_link(dirEntry, text, rssmeta);
+	string pubDate = get_pubdate(dirEntry, text, rssmeta);
+	string guid    = get_guid(dirEntry, text, rssmeta);
+
+	if (title is null)
+		fail("title");
+
+	if (link is null)
+		fail("link");
+
+	if (pubDate is null)
+		fail("pubDate");
+
+	if (guid is null)
+		fail("guid");
+	// NOTE(review): positional init; Entry's first visible field is `title` but `text` is passed first - confirm field order.
+	Entry ret = Entry(text, title, link, pubDate, guid);
+	return ret;
+
+
+}
 // Flags which are passed to -i (--ignore).
immutable FILE_WO_ENTRY = "file-wo-entry";
@@ -119,7 +230,7 @@ int main(string[] args)
 	/* Flags */
 	string metafile;
 	bool entries_set = false;
-	string entries;
+	string entries_dir;
 	string target;
 	string[] ignore;
 	bool recursive = false;
@@ -137,6 +248,7 @@ int main(string[] args)
 		else if (args[i] == "-f" || args[i] == "--follow")
 			follow_symlinks = true;
 		else if (args[i] == "-o" || args[i] == "--order")
+		{} // TODO: this branch has no body yet; the order option is not handled.
 		else if (args[i] == "-i" || args[i] == "--ignore")
 		{
 			auto param = args[++i];
@@ -154,19 +266,25 @@ int main(string[] args)
 			if (!entries_set)
 			{
 				entries_set = true;
-				entries = metafile[i];
+				entries_dir = args[i];
 			}
 			else
-				target = metafile[i];
+				target = args[i];
 		}
 	}
+	// Get RssMeta.
+	// TODO: also make it so it can get the metafile from the current dir
+	// and rssmeta.xml xor rssmeta.json files.
+	RssMeta rssmeta = RssMeta(metafile);
+
+
 	// Grab list of files in directory.
 	DirEntry[] objects = [];
-	foreach (DirEntry file; dirEntries(entries, ".html",
+	foreach (DirEntry file; dirEntries(entries_dir, "*.html", // glob: a bare ".html" only matches a file named exactly ".html"
 				recursive? SpanMode.depth : SpanMode.shallow,
 				follow_symlinks)
-			.filter!(file => f.isFile())
+			.filter!(f => f.isFile())
 			)
 	{
 		objects ~= file;
@@ -177,15 +295,19 @@ int main(string[] args)
 	Entry[] entries;
 	foreach (DirEntry dirEntry; objects)
 	{
-		entries ~= make_entry(dirEntry);
+		entries ~= make_entry(dirEntry, rssmeta);
 	}
 	// Iterate over all entries in the order as described by -o flag, and
 	// create the entry for them. This gets spat back out as a big string.
-	auto entries = create_entries(...);
+	// auto entries = create_entries(...);
 	// Now, write all the fluff around the entries, and jobs-almost-agoodun.
 	// Finally, write back the full RSS XML into the file specified by $2.
+ + + +	return 0;  } diff --git a/source/html.d b/source/html.d new file mode 100644 index 0000000..768e270 --- /dev/null +++ b/source/html.d @@ -0,0 +1,38 @@ +// Represents all functions for process the raw HTML and  +// extracting information from it, transforming it, etc. +// All fns operate on strings. + +import std.algorithm : findSplitAfter, find; + + +// Given the haystack (the full HTML) search for the RSS comment +// representing that attribute. +string find_rss_comment(string attr, string haystack) +in (haystack !is null) +{ +	immutable string needle = "<!-- rss-" ~ attr ~ ":";  +	string found  = haystack.find(needle); + +	if (found is null) +		return null; + +	// As otherwise comment begin will be in `found`. +	found = found[needle.length .. $]; + +	// And get before the closing comment. +	string result = found.findSplitAfter("-->")[0]; +	return result; +} + +string find_html_title(string htmltext) +{ +	// TODO: implement. +	immutable string init_tag = "<title>"; +	string found = htmltext.find(init_tag); +	if (found is null) +		return null; + +	found = found[init_tag.length .. $]; +	string result = found.findSplitAfter("</title>")[0]; +	return result; +} diff --git a/source/rssmeta.d b/source/rssmeta.d new file mode 100644 index 0000000..6fec5b4 --- /dev/null +++ b/source/rssmeta.d @@ -0,0 +1,13 @@ +// Represents the RssMeta struct, containing the metadata with some fns to  +// access it easily. + +struct RssMeta +{ +	// TODO: add the magic. + +	// filename: either just name or fully qualified should work. +	string title_of(string filename) const; +	string link_of(string filename) const; +	string pubdate_of(string filename) const; +	string guid_of(string filename) const; +} | 
