diff options
author | George Abbott <george@gabbott.dev> | 2023-10-04 20:20:57 +0100 |
---|---|---|
committer | George Abbott <george@gabbott.dev> | 2023-10-04 20:20:57 +0100 |
commit | 08cec360e16615b1ae157e4926e7317d16765f7e (patch) | |
tree | 3a03e0a1acf89973de92e7f6f7a08b2a587b7485 | |
parent | 8e4092347975db7fe90b0895f605bdd79de47410 (diff) |
Progress thus far
-rw-r--r-- | dub.selections.json | 6 | ||||
-rw-r--r-- | source/app.d | 172 | ||||
-rw-r--r-- | source/html.d | 38 | ||||
-rw-r--r-- | source/rssmeta.d | 13 |
4 files changed, 204 insertions, 25 deletions
diff --git a/dub.selections.json b/dub.selections.json new file mode 100644 index 0000000..70f9033 --- /dev/null +++ b/dub.selections.json @@ -0,0 +1,6 @@ +{ + "fileVersion": 1, + "versions": { + "dxml": "0.4.4" + } +} diff --git a/source/app.d b/source/app.d index 3c913c3..97630d2 100644 --- a/source/app.d +++ b/source/app.d @@ -56,22 +56,30 @@ */ +import std.stdio; +import std.algorithm : canFind, filter, findSplitAfter; +import std.file : dirEntries, DirEntry, readText, SpanMode; +import core.stdc.stdlib : exit; + +import html; +import rssmeta; + void usage() { writeln( - "reassess: create an RSS feed out of HTML documents.\n" - "Usage: \n" - "\treassess DIRECTORY TARGET (FLAGS)\n" - "DIRECTORY: The directory where the HTML files are sourced.\n" - "TARGET: The desired target location of the RSS file.\n" - "Flags:\n" - "\t-h, --help Display this help message.\n" - "\t-m, --metafile The path of the metafile if not in DIRECTORY.\n" - "\t-r, --recursive Whether to recurse DIRECTORY or not.\n" - "\t-f, --follow Whether to follow symlinks in DIRECTORY or not.\n" - "\t-o, --order The order to output the RSS entries.\n" - "\t Options: reverse; forward; alphabetic-by-title.\n" - "\t Default to `reverse` if unspecified.\n" + "reassess: create an RSS feed out of HTML documents.\n" ~ + "Usage: \n" ~ + "\treassess DIRECTORY TARGET (FLAGS)\n" ~ + "DIRECTORY: The directory where the HTML files are sourced.\n" ~ + "TARGET: The desired target location of the RSS file.\n" ~ + "Flags:\n" ~ + "\t-h, --help Display this help message.\n" ~ + "\t-m, --metafile The path of the metafile if not in DIRECTORY.\n" ~ + "\t-r, --recursive Whether to recurse DIRECTORY or not.\n" ~ + "\t-f, --follow Whether to follow symlinks in DIRECTORY or not.\n" ~ + "\t-o, --order The order to output the RSS entries.\n" ~ + "\t Options: reverse; forward; alphabetic-by-title.\n" ~ + "\t Default to `reverse` if unspecified.\n" ~ "\t-c, --config Options to configure functionality. Details TODO\n" ); exit(0); @@ -79,6 +87,7 @@ void usage() +// TODO: make ctor so that pubDateAsUnixTimestamp gets populated. struct Entry { string title; @@ -91,7 +100,8 @@ private: int pubDateAsUnixTimestamp; // So that we can sort easily. } -Entry make_entry(DirEntry dirEntry) + +Entry make_entry(DirEntry dirEntry, RssMeta rssmeta) { // From the DirEntry, we can get all file info, and also read the full file // since we know its path. We just need to remember, when we read the @@ -100,14 +110,115 @@ Entry make_entry(DirEntry dirEntry) // escaped ones. // Or, if <body> does not work, we could try between some comments. // I say, <!-- bct:beg --> and <!-- bct:end --> would meet those criteria. -} + // COMPLETE + string get_title(DirEntry dirEntry, in string text, in RssMeta rssmeta) + { + // Title is either: + // 1) rssmeta:<title> + // 2) <!-- rss-title: TITLE --> comment in text. + // 3) <title>TITLE</title> from text. + string title = rssmeta.title_of(dirEntry.name); + if (title !is null) + return title; + title = find_rss_comment("title", text); + if (title !is null) + return title; + title = find_html_title(text); + return title; // even if it's null. + } -import std.stdio; -import std.algorithm : canFind; -import std.file : dirEntries, DirEntry; + // COMPLETE + string get_link(DirEntry dirEntry, in string text, in RssMeta rssmeta) + { + // Link is either: + // 1) rssmeta:<link> + // 2) <!-- rss-link: link --> comment in text. + + string link = rssmeta.link_of(dirEntry.name); + if (link !is null) + return link; + + link = find_rss_comment("link", text); + return link; + } + + // Output is formatted as ISO-8601. + string get_pubdate(DirEntry dirEntry, in string text, in RssMeta rssmeta) + { + // The date must be formatted YYYY-mm-dd HH:MM. + // pubDate is either: + // 1) rssmeta:<pubDate> + // 2) <!-- rss-pubDate: TITLE --> comment in text. + // 3) the Unix timestamp of the file. + + string pubDate = rssmeta.title_of(dirEntry.name); + if (pubDate !is null) + return pubDate; + + pubDate = find_rss_comment("pubDate", text); + if (pubDate !is null) + return pubDate; + + // TODO: add in getting date from the dirEntry.time. + + return pubDate; // even if it's null. + } + + // COMPLETE + string get_guid(DirEntry dirEntry, in string text, in RssMeta rssmeta) + { + // guid is either: + // 1) rssmeta:<guid> + // 2) <!-- rss-guid: guid --> comment in text. + // 3) delegated to get_link as a last resort. + + string guid = rssmeta.guid_of(dirEntry.name); + if (guid !is null) + return guid; + + guid = find_rss_comment("guid", text); + if (guid !is null) + return guid; + + return get_link(dirEntry, text, rssmeta); + } + + void fail(string what) + { + writeln("Error occurred when retrieving attribute ", what, " in make_entry"); + exit(-1); + } + + + string text = readText(dirEntry.name); + string title = get_title(dirEntry, text, rssmeta); + string link = get_link(dirEntry, text, rssmeta); + string pubDate = get_pubdate(dirEntry, text, rssmeta); + string guid = get_guid(dirEntry, text, rssmeta); + + if (text is null) + fail("text"); + + if (title is null) + fail("title"); + + if (link is null) + fail("link"); + + if (pubDate is null) + fail("pubDate"); + + if (guid is null) + fail("guid"); + + Entry ret = Entry(text, title, link, pubDate, guid); + return ret; + + +} // Flags which are passed to -i (--ignore). immutable FILE_WO_ENTRY = "file-wo-entry"; @@ -119,7 +230,7 @@ int main(string[] args) /* Flags */ string metafile; bool entries_set = false; - string entries; + string entries_dir; string target; string[] ignore; bool recursive = false; @@ -137,6 +248,7 @@ int main(string[] args) else if (args[i] == "-f" || args[i] == "--follow") follow_symlinks = true; else if (args[i] == "-o" || args[i] == "--order") + {} else if (args[i] == "-i" || args[i] == "--ignore") { auto param = args[++i]; @@ -154,19 +266,25 @@ int main(string[] args) if (!entries_set) { entries_set = true; - entries = metafile[i]; + entries_dir = args[i]; } else - target = metafile[i]; + target = args[i]; } } + // Get RssMeta. + // TODO: also make it so it can get the metafile from the current dir + // and rssmeta.xml xor rssmeta.json files. + RssMeta rssmeta = RssMeta(metafile); + + // Grab list of files in directory. DirEntry[] objects = []; - foreach (DirEntry file; dirEntries(entries, ".html", + foreach (DirEntry file; dirEntries(entries_dir, ".html", recursive? SpanMode.depth : SpanMode.shallow, follow_symlinks) - .filter!(file => f.isFile()) + .filter!(f => f.isFile()) ) { objects ~= file; @@ -177,15 +295,19 @@ int main(string[] args) Entry[] entries; foreach (DirEntry dirEntry; objects) { - entries ~= make_entry(dirEntry); + entries ~= make_entry(dirEntry, rssmeta); } // Iterate over all entries in the order as described by -o flag, and // create the entry for them. This gets spat back out as a big string. - auto entries = create_entries(...); + // auto entries = create_entries(...); // Now, write all the fluff around the entries, and jobs-almost-agoodun. // Finally, write back the full RSS XML into the file specified by $2. + + + + return 0; } diff --git a/source/html.d b/source/html.d new file mode 100644 index 0000000..768e270 --- /dev/null +++ b/source/html.d @@ -0,0 +1,38 @@ +// Represents all functions for process the raw HTML and +// extracting information from it, transforming it, etc. +// All fns operate on strings. + +import std.algorithm : findSplitAfter, find; + + +// Given the haystack (the full HTML) search for the RSS comment +// representing that attribute. +string find_rss_comment(string attr, string haystack) +in (haystack !is null) +{ + immutable string needle = "<!-- rss-" ~ attr ~ ":"; + string found = haystack.find(needle); + + if (found is null) + return null; + + // As otherwise comment begin will be in `found`. + found = found[needle.length .. $]; + + // And get before the closing comment. + string result = found.findSplitAfter("-->")[0]; + return result; +} + +string find_html_title(string htmltext) +{ + // TODO: implement. + immutable string init_tag = "<title>"; + string found = htmltext.find(init_tag); + if (found is null) + return null; + + found = found[init_tag.length .. $]; + string result = found.findSplitAfter("</title>")[0]; + return result; +} diff --git a/source/rssmeta.d b/source/rssmeta.d new file mode 100644 index 0000000..6fec5b4 --- /dev/null +++ b/source/rssmeta.d @@ -0,0 +1,13 @@ +// Represents the RssMeta struct, containing the metadata with some fns to +// access it easily. + +struct RssMeta +{ + // TODO: add the magic. + + // filename: either just name or fully qualified should work. + string title_of(string filename) const; + string link_of(string filename) const; + string pubdate_of(string filename) const; + string guid_of(string filename) const; +} |