/* reassess: generates an RSS feed given a directory.
To run, simply pass in the directory containing all entries, and the
intended target RSS location. For instance:
reassess $WEBSITE_URL/blog/entries $WEBSITE_URL/blog/rss.xml
Then, you are set! Remember to re-run this program whenever an update is
made to the website.
The location of the metadata can be derived either from the file itself, or
from an rssmeta.xml file, which should contain all the metadata for the
relevant files.
If no rssmeta file is available, provide the metadata within each file itself.
reassess requires the following bits of metadata to be provided:
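TAG           | derived from?
--------------|-------------
- title         rssmeta:title OR an rss "title" comment in the file OR the
                HTML title tag OR the first heading tag
- link          rssmeta:link OR assessrc:global_link OR an rss "link"
                comment in the file
- pubDate       rssmeta:pubDate OR an rss "pubDate" comment in the file OR
                the timestamp of the file
- guid          rssmeta:guid OR an rss "guid" comment in the file OR the
                same as link.
- description   Everything within the entry's content tags.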
It is best to provide an rssmeta.xml or rssmeta.json file, but failing this
all information can be derived from the body of the document if correctly
written.
reassess will fail if:
- There is both an rssmeta.xml and an rssmeta.json file in $1.
- The rssmeta file is corrupted or incorrectly formatted.
- There is insufficient information to populate a metadata field.
etc.
In all these cases, the program writes an error to stderr and exits with a
non-zero status.
Optional Flags:
-m Gives the location of the rssmeta.xml or rssmeta.json if it is not
present in the directory to be converted.
`reassess $WEBSITE_URL/blog/entries $WEBSITE_URL/blog/rss.xml -m $METAFILE`
-i Which errors to ignore (`file-wo-entry` or `entry-wo-file`).
By default, nothing is ignored.
-o Order: in-order; reverse; title-alphabetical; more TBA.
Default is reverse, which gives most recent entry first.
Format of rssmeta.xml:
...
... the metadata entries, e.g. <title>, <link>, <pubDate>, <guid>, etc.
The file-identifying attribute of each entry is used to find the file in
particular. An error is thrown if:
- There exists a file in the directory which is not present in the rssmeta
file (You can pass in `-i file-wo-entry` to ignore this check, and skip
said file when outputting RSS.)
- There exists a file in the rssmeta file which is not in the directory.
(You can pass in `-i entry-wo-file` to ignore this.)
*/
import std.stdio;
import std.algorithm : canFind, filter, findSplitAfter;
import std.file : dirEntries, DirEntry, readText, SpanMode;
import core.stdc.stdlib : exit;
import html;
import rssmeta;
// Prints the command-line help text to stdout, then terminates the process
// with exit status 0. This function never returns to its caller.
//
// Every flag that main() parses is listed here, including -i/--ignore and
// the two values it accepts.
void usage()
{
    writeln(
        "reassess: create an RSS feed out of HTML documents.\n" ~
        "Usage: \n" ~
        "\treassess DIRECTORY TARGET (FLAGS)\n" ~
        "DIRECTORY: The directory where the HTML files are sourced.\n" ~
        "TARGET: The desired target location of the RSS file.\n" ~
        "Flags:\n" ~
        "\t-h, --help Display this help message.\n" ~
        "\t-m, --metafile The path of the metafile if not in DIRECTORY.\n" ~
        "\t-r, --recursive Whether to recurse DIRECTORY or not.\n" ~
        "\t-f, --follow Whether to follow symlinks in DIRECTORY or not.\n" ~
        "\t-i, --ignore An error to ignore. May be given more than once.\n" ~
        "\t Options: file-wo-entry; entry-wo-file.\n" ~
        "\t Default is to ignore nothing.\n" ~
        "\t-o, --order The order to output the RSS entries.\n" ~
        "\t Options: reverse; forward; alphabetic-by-title.\n" ~
        "\t Default to `reverse` if unspecified.\n" ~
        "\t-c, --config Options to configure functionality. Details TODO\n"
    );
    exit(0);
}
// A single item of the RSS feed.
//
// The five public fields are declared in the order title, link, pubDate,
// guid, description; positional struct literals must follow that order.
//
// TODO: add a constructor so that pubDateAsUnixTimestamp gets populated.
struct Entry
{
    string title;
    string link;
    string pubDate;
    string guid;
    string description; // The actual entry itself.

    // Sort key so that entries can be ordered easily; not populated yet.
    private int pubDateAsUnixTimestamp;
}
// Builds the RSS Entry for one HTML file.
//
// Each metadata field is taken from the first source that yields a non-null
// value: the rssmeta record keyed by the file's name, then an RSS comment
// embedded in the file's text, then a field-specific fallback (see the
// nested helpers below).
//
// Params:
//   dirEntry = the file to convert; dirEntry.name is read from disk and is
//              also the key used for every rssmeta lookup.
//   rssmeta  = the metadata loaded from the rssmeta file.
//
// Returns: an Entry whose title, link, pubDate, guid and description are set.
//
// Does not return on failure: when the text or any metadata field is null,
// an error is written to stderr and the process exits with status -1.
//
// TODO: the description is currently the whole file text. It should be cut
// down to the entry's content region and have unsafe characters escaped.
Entry make_entry(DirEntry dirEntry, RssMeta rssmeta)
{
    // Title is either:
    // 1) the rssmeta title for this file,
    // 2) an RSS "title" comment in the text,
    // 3) the HTML title found in the text.
    // May return null when none of the sources has a title.
    string get_title(DirEntry dirEntry, in string text, in RssMeta rssmeta)
    {
        string title = rssmeta.title_of(dirEntry.name);
        if (title !is null)
            return title;
        title = find_rss_comment("title", text);
        if (title !is null)
            return title;
        return find_html_title(text); // even if it's null.
    }

    // Link is either:
    // 1) the rssmeta link for this file,
    // 2) an RSS "link" comment in the text.
    // May return null when neither source has a link.
    string get_link(DirEntry dirEntry, in string text, in RssMeta rssmeta)
    {
        string link = rssmeta.link_of(dirEntry.name);
        if (link !is null)
            return link;
        return find_rss_comment("link", text);
    }

    // pubDate is either:
    // 1) the rssmeta pubDate for this file,
    // 2) an RSS "pubDate" comment in the text.
    // The date must be formatted YYYY-mm-dd HH:MM.
    // May return null when neither source has a date.
    // TODO: fall back to the timestamp of the file itself.
    string get_pubdate(DirEntry dirEntry, in string text, in RssMeta rssmeta)
    {
        // This lookup used to call title_of, which stored the title in the
        // pubDate field.
        // NOTE(review): assumes RssMeta exposes pubDate_of alongside
        // title_of/link_of/guid_of -- confirm against the rssmeta module.
        string pubDate = rssmeta.pubDate_of(dirEntry.name);
        if (pubDate !is null)
            return pubDate;
        return find_rss_comment("pubDate", text); // even if it's null.
    }

    // guid is either:
    // 1) the rssmeta guid for this file,
    // 2) an RSS "guid" comment in the text,
    // 3) the link, as a last resort.
    string get_guid(DirEntry dirEntry, in string text, in RssMeta rssmeta)
    {
        string guid = rssmeta.guid_of(dirEntry.name);
        if (guid !is null)
            return guid;
        guid = find_rss_comment("guid", text);
        if (guid !is null)
            return guid;
        return get_link(dirEntry, text, rssmeta);
    }

    // Reports which attribute could not be resolved and terminates the
    // program. Errors go to stderr so they never mix with regular output.
    void fail(string what)
    {
        stderr.writeln("Error occurred when retrieving attribute ", what, " in make_entry");
        exit(-1);
    }

    string text = readText(dirEntry.name);
    if (text is null)
        fail("text");

    string title = get_title(dirEntry, text, rssmeta);
    if (title is null)
        fail("title");

    string link = get_link(dirEntry, text, rssmeta);
    if (link is null)
        fail("link");

    string pubDate = get_pubdate(dirEntry, text, rssmeta);
    if (pubDate is null)
        fail("pubDate");

    string guid = get_guid(dirEntry, text, rssmeta);
    if (guid is null)
        fail("guid");

    // Entry's fields are declared as title, link, pubDate, guid, description,
    // so the file text must be passed last, not first.
    return Entry(title, link, pubDate, guid, text);
}
// Values accepted by the -i (--ignore) flag.
immutable string FILE_WO_ENTRY = "file-wo-entry";
immutable string ENTRY_WO_FILE = "entry-wo-file";
// The complete list of values that -i may be given.
immutable string[] DASH_I_PARAMS = [ FILE_WO_ENTRY, ENTRY_WO_FILE ];
// Program entry point: parses the command line, collects the HTML files in
// DIRECTORY and converts each one into an Entry.
//
// Params:
//   args = the raw command line; args[0] is the program name and is skipped.
//
// Returns: 0 on success, -1 when DIRECTORY and TARGET are not both given
// (or extra positional arguments are present).
//
// A flag that is missing its value, or a bad value for -i, writes an error
// to stderr and exits the process with status -1. -h/--help prints the usage
// text and exits with status 0.
int main(string[] args)
{
    /* Flags */
    string metafile;
    string order = "reverse"; // TODO: parsed but not applied to the output yet.
    string[] ignore;
    bool recursive = false;
    bool follow_symlinks = false;
    // Non-flag arguments, in order: DIRECTORY then TARGET.
    string[] positional;

    // Returns the value that follows the flag at args[i] and advances i past
    // it. Exits with an error when the flag is the last argument, instead of
    // indexing past the end of args.
    string value_after(ref size_t i)
    {
        if (i + 1 >= args.length)
        {
            stderr.writeln("Missing value for ", args[i]);
            exit(-1);
        }
        return args[++i];
    }

    // Start at 1: args[0] is the program name, not DIRECTORY.
    for (size_t i = 1; i < args.length; ++i)
    {
        string arg = args[i];
        if (arg == "-m" || arg == "--metafile")
            metafile = value_after(i);
        else if (arg == "-h" || arg == "--help")
            usage();
        else if (arg == "-r" || arg == "--recursive")
            recursive = true;
        else if (arg == "-f" || arg == "--follow")
            follow_symlinks = true;
        else if (arg == "-o" || arg == "--order")
        {
            // Consume the value so that it is not mistaken for a positional
            // argument.
            order = value_after(i);
        }
        else if (arg == "-i" || arg == "--ignore")
        {
            auto param = value_after(i);
            if (DASH_I_PARAMS.canFind(param))
                ignore ~= param;
            else
            {
                stderr.writeln("Bad param for -i:", param);
                exit(-1);
            }
        }
        else
            positional ~= arg;
    }

    if (positional.length != 2)
    {
        stderr.writeln("Expected DIRECTORY and TARGET, but got ",
                positional.length, " positional argument(s). See --help.");
        return -1;
    }
    string entries_dir = positional[0];
    string target = positional[1];

    // Get RssMeta.
    // TODO: also make it so it can get the metafile from the current dir
    // and rssmeta.xml xor rssmeta.json files.
    RssMeta rssmeta = RssMeta(metafile);

    // Grab list of files in directory. The pattern is a glob matched against
    // each file name, so it needs the leading `*` to match "anything.html".
    DirEntry[] objects = [];
    foreach (DirEntry file; dirEntries(entries_dir, "*.html",
                recursive ? SpanMode.depth : SpanMode.shallow,
                follow_symlinks)
            .filter!(f => f.isFile())
        )
    {
        objects ~= file;
    }

    // Convert the file, given all info for the metadata, into an Entry.
    Entry[] entries;
    foreach (DirEntry dirEntry; objects)
    {
        entries ~= make_entry(dirEntry, rssmeta);
    }

    // TODO: iterate over all entries in the order described by the -o flag
    // and render each one; this gets spat back out as a big string.
    // TODO: write all the fluff around the entries.
    // TODO: finally, write the full RSS XML into the file given by `target`.
    return 0;
}