summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorge Abbott <george@gabbott.dev>2023-10-04 20:20:57 +0100
committerGeorge Abbott <george@gabbott.dev>2023-10-04 20:20:57 +0100
commit08cec360e16615b1ae157e4926e7317d16765f7e (patch)
tree3a03e0a1acf89973de92e7f6f7a08b2a587b7485
parent8e4092347975db7fe90b0895f605bdd79de47410 (diff)
Progress thus far
-rw-r--r--dub.selections.json6
-rw-r--r--source/app.d172
-rw-r--r--source/html.d38
-rw-r--r--source/rssmeta.d13
4 files changed, 204 insertions, 25 deletions
diff --git a/dub.selections.json b/dub.selections.json
new file mode 100644
index 0000000..70f9033
--- /dev/null
+++ b/dub.selections.json
@@ -0,0 +1,6 @@
+{
+ "fileVersion": 1,
+ "versions": {
+ "dxml": "0.4.4"
+ }
+}
diff --git a/source/app.d b/source/app.d
index 3c913c3..97630d2 100644
--- a/source/app.d
+++ b/source/app.d
@@ -56,22 +56,30 @@
*/
+import std.stdio;
+import std.algorithm : canFind, filter, findSplitAfter;
+import std.file : dirEntries, DirEntry, readText, SpanMode;
+import core.stdc.stdlib : exit;
+
+import html;
+import rssmeta;
+
void usage()
{
writeln(
- "reassess: create an RSS feed out of HTML documents.\n"
- "Usage: \n"
- "\treassess DIRECTORY TARGET (FLAGS)\n"
- "DIRECTORY: The directory where the HTML files are sourced.\n"
- "TARGET: The desired target location of the RSS file.\n"
- "Flags:\n"
- "\t-h, --help Display this help message.\n"
- "\t-m, --metafile The path of the metafile if not in DIRECTORY.\n"
- "\t-r, --recursive Whether to recurse DIRECTORY or not.\n"
- "\t-f, --follow Whether to follow symlinks in DIRECTORY or not.\n"
- "\t-o, --order The order to output the RSS entries.\n"
- "\t Options: reverse; forward; alphabetic-by-title.\n"
- "\t Default to `reverse` if unspecified.\n"
+ "reassess: create an RSS feed out of HTML documents.\n" ~
+ "Usage: \n" ~
+ "\treassess DIRECTORY TARGET (FLAGS)\n" ~
+ "DIRECTORY: The directory where the HTML files are sourced.\n" ~
+ "TARGET: The desired target location of the RSS file.\n" ~
+ "Flags:\n" ~
+ "\t-h, --help Display this help message.\n" ~
+ "\t-m, --metafile The path of the metafile if not in DIRECTORY.\n" ~
+ "\t-r, --recursive Whether to recurse DIRECTORY or not.\n" ~
+ "\t-f, --follow Whether to follow symlinks in DIRECTORY or not.\n" ~
+ "\t-o, --order The order to output the RSS entries.\n" ~
+ "\t Options: reverse; forward; alphabetic-by-title.\n" ~
+ "\t Default to `reverse` if unspecified.\n" ~
"\t-c, --config Options to configure functionality. Details TODO\n"
);
exit(0);
@@ -79,6 +87,7 @@ void usage()
+// TODO: make ctor so that pubDateAsUnixTimestamp gets populated.
struct Entry
{
string title;
@@ -91,7 +100,8 @@ private:
int pubDateAsUnixTimestamp; // So that we can sort easily.
}
-Entry make_entry(DirEntry dirEntry)
+
+Entry make_entry(DirEntry dirEntry, RssMeta rssmeta)
{
// From the DirEntry, we can get all file info, and also read the full file
// since we know its path. We just need to remember, when we read the
@@ -100,14 +110,115 @@ Entry make_entry(DirEntry dirEntry)
// escaped ones.
// Or, if <body> does not work, we could try between some comments.
// I say, <!-- bct:beg --> and <!-- bct:end --> would meet those criteria.
-}
+ // COMPLETE
+ string get_title(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+ {
+     // Sources for the title, highest priority first:
+     //   1) the rssmeta entry for this file,
+     //   2) a <!-- rss-title: TITLE --> comment in the text,
+     //   3) the <title>TITLE</title> element of the text.
+     string fromMeta = rssmeta.title_of(dirEntry.name);
+     if (fromMeta is null)
+     {
+         string fromComment = find_rss_comment("title", text);
+
+         // The last resort may itself be null; the caller deals with that.
+         return (fromComment is null) ? find_html_title(text) : fromComment;
+     }
+
+     return fromMeta;
+ }
-import std.stdio;
-import std.algorithm : canFind;
-import std.file : dirEntries, DirEntry;
+ // COMPLETE
+ string get_link(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+ {
+     // Sources for the link, highest priority first:
+     //   1) the rssmeta entry for this file,
+     //   2) a <!-- rss-link: link --> comment in the text.
+     // The result may be null when neither source provides a link.
+     string fromMeta = rssmeta.link_of(dirEntry.name);
+
+     return (fromMeta !is null) ? fromMeta : find_rss_comment("link", text);
+ }
+
+ // Resolves the publication date string for the entry backed by dirEntry.
+ // The intended output format is ISO-8601.
+ // NOTE(review): no normalisation happens here -- whatever string the
+ // chosen source holds is returned verbatim.
+ // Returns null when no source provides a date.
+ string get_pubdate(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+ {
+     // The date must be formatted YYYY-mm-dd HH:MM.
+     // pubDate is either:
+     // 1) rssmeta:<pubDate>
+     // 2) <!-- rss-pubDate: DATE --> comment in text.
+     // 3) the Unix timestamp of the file (not implemented yet, see TODO).
+
+     // Ask the metafile for the publication date, not for the title.
+     string pubDate = rssmeta.pubdate_of(dirEntry.name);
+     if (pubDate !is null)
+         return pubDate;
+
+     pubDate = find_rss_comment("pubDate", text);
+     if (pubDate !is null)
+         return pubDate;
+
+     // TODO: add in getting date from the dirEntry.time.
+
+     return pubDate; // even if it's null.
+ }
+
+ // COMPLETE
+ string get_guid(DirEntry dirEntry, in string text, in RssMeta rssmeta)
+ {
+     // Sources for the guid, highest priority first:
+     //   1) the rssmeta entry for this file,
+     //   2) a <!-- rss-guid: guid --> comment in the text,
+     //   3) whatever get_link resolves to, as a last resort.
+     string fromMeta = rssmeta.guid_of(dirEntry.name);
+     if (fromMeta is null)
+     {
+         string fromComment = find_rss_comment("guid", text);
+
+         return (fromComment is null)
+             ? get_link(dirEntry, text, rssmeta)
+             : fromComment;
+     }
+
+     return fromMeta;
+ }
+
+ // Reports which attribute could not be retrieved, then terminates the
+ // process with exit status -1. Does not return.
+ //   what = name of the attribute that is missing.
+ void fail(string what)
+ {
+     // Diagnostics go to stderr so that they never mix with regular output.
+     stderr.writeln("Error occurred when retrieving attribute ", what, " in make_entry");
+     exit(-1);
+ }
+
+
+ string text = readText(dirEntry.name);
+
+ // Check the file contents before handing them to the helpers:
+ // find_rss_comment has an `in (haystack !is null)` contract, so a null
+ // text has to be rejected here rather than after the lookups.
+ if (text is null)
+     fail("text");
+
+ // Each attribute is mandatory; fail() exits the process on the first
+ // one that cannot be resolved.
+ string title = get_title(dirEntry, text, rssmeta);
+ if (title is null)
+     fail("title");
+
+ string link = get_link(dirEntry, text, rssmeta);
+ if (link is null)
+     fail("link");
+
+ string pubDate = get_pubdate(dirEntry, text, rssmeta);
+ if (pubDate is null)
+     fail("pubDate");
+
+ string guid = get_guid(dirEntry, text, rssmeta);
+ if (guid is null)
+     fail("guid");
+
+ // NOTE(review): the first field of Entry is `title`, but `text` is passed
+ // first here -- confirm that the positional order matches the struct.
+ Entry ret = Entry(text, title, link, pubDate, guid);
+ return ret;
+
+
+}
// Flags which are passed to -i (--ignore).
immutable FILE_WO_ENTRY = "file-wo-entry";
@@ -119,7 +230,7 @@ int main(string[] args)
/* Flags */
string metafile;
bool entries_set = false;
- string entries;
+ string entries_dir;
string target;
string[] ignore;
bool recursive = false;
@@ -137,6 +248,7 @@ int main(string[] args)
else if (args[i] == "-f" || args[i] == "--follow")
follow_symlinks = true;
else if (args[i] == "-o" || args[i] == "--order")
+ {}
else if (args[i] == "-i" || args[i] == "--ignore")
{
auto param = args[++i];
@@ -154,19 +266,25 @@ int main(string[] args)
if (!entries_set)
{
entries_set = true;
- entries = metafile[i];
+ entries_dir = args[i];
}
else
- target = metafile[i];
+ target = args[i];
}
}
+ // Get RssMeta.
+ // TODO: also make it so it can get the metafile from the current dir
+ // and rssmeta.xml xor rssmeta.json files.
+ RssMeta rssmeta = RssMeta(metafile);
+
+
// Grab list of files in directory.
DirEntry[] objects = [];
- foreach (DirEntry file; dirEntries(entries, ".html",
+ foreach (DirEntry file; dirEntries(entries_dir, ".html",
recursive? SpanMode.depth : SpanMode.shallow,
follow_symlinks)
- .filter!(file => f.isFile())
+ .filter!(f => f.isFile())
)
{
objects ~= file;
@@ -177,15 +295,19 @@ int main(string[] args)
Entry[] entries;
foreach (DirEntry dirEntry; objects)
{
- entries ~= make_entry(dirEntry);
+ entries ~= make_entry(dirEntry, rssmeta);
}
// Iterate over all entries in the order as described by -o flag, and
// create the entry for them. This gets spat back out as a big string.
- auto entries = create_entries(...);
+ // auto entries = create_entries(...);
// Now, write all the fluff around the entries, and jobs-almost-agoodun.
// Finally, write back the full RSS XML into the file specified by $2.
+
+
+
+ return 0;
}
diff --git a/source/html.d b/source/html.d
new file mode 100644
index 0000000..768e270
--- /dev/null
+++ b/source/html.d
@@ -0,0 +1,38 @@
+// Functions for processing the raw HTML: extracting information from it,
+// transforming it, etc.
+// All functions operate on strings.
+
+import std.algorithm : findSplitAfter, find;
+
+
+// Given the haystack (the full HTML), search for the RSS comment
+// representing that attribute, i.e. `<!-- rss-ATTR: VALUE -->`.
+//
+// Params:
+//   attr     = attribute name placed after `rss-`, e.g. "title" or "link".
+//   haystack = the full HTML text to search; must not be null.
+// Returns: VALUE with surrounding whitespace stripped, or null when the
+// comment is absent, is never closed by `-->`, or holds an empty value.
+string find_rss_comment(string attr, string haystack)
+in (haystack !is null)
+{
+    import std.algorithm.searching : find, findSplitBefore;
+    import std.string : strip;
+
+    immutable string needle = "<!-- rss-" ~ attr ~ ":";
+
+    // `find` yields an empty slice -- not null -- when there is no match,
+    // so a miss has to be detected through the length.
+    string found = haystack.find(needle);
+    if (found.length == 0)
+        return null;
+
+    // Skip the opening marker, then cut right before the closing `-->`.
+    auto parts = found[needle.length .. $].findSplitBefore("-->");
+    if (parts[1].length == 0)
+        return null; // The comment is never terminated.
+
+    string value = parts[0].strip();
+    return value.length ? value : null;
+}
+
+// Extracts the text between the first `<title>` and the `</title>` that
+// follows it.
+//
+// Params:
+//   htmltext = the full HTML text to search.
+// Returns: the title with surrounding whitespace stripped, or null when
+// there is no `<title>`, no closing `</title>`, or the title is empty.
+// Matching is literal: only the exact lowercase, attribute-free tags are
+// recognised.
+string find_html_title(string htmltext)
+{
+    import std.algorithm.searching : find, findSplitBefore;
+    import std.string : strip;
+
+    immutable string init_tag = "<title>";
+    immutable string end_tag = "</title>";
+
+    // `find` yields an empty slice -- not null -- when there is no match,
+    // so a miss has to be detected through the length.
+    string found = htmltext.find(init_tag);
+    if (found.length == 0)
+        return null;
+
+    // Skip the opening tag, then cut right before the closing tag.
+    auto parts = found[init_tag.length .. $].findSplitBefore(end_tag);
+    if (parts[1].length == 0)
+        return null; // The element is never closed.
+
+    string title = parts[0].strip();
+    return title.length ? title : null;
+}
diff --git a/source/rssmeta.d b/source/rssmeta.d
new file mode 100644
index 0000000..6fec5b4
--- /dev/null
+++ b/source/rssmeta.d
@@ -0,0 +1,13 @@
+// Represents the RssMeta struct, containing the metadata with some fns to
+// access it easily.
+
+// Per-file feed metadata, looked up by file name.
+// NOTE(review): the accessors are declarations only -- no bodies, fields or
+// constructor are visible here, while app.d builds this with
+// `RssMeta(metafile)`; presumably the implementation is still to come.
+struct RssMeta
+{
+ // TODO: add the magic.
+
+ // filename: either just name or fully qualified should work.
+ // Each accessor returns a string; the callers in app.d compare the result
+ // against null and fall back to another source when it is null.
+ string title_of(string filename) const;
+ string link_of(string filename) const;
+ string pubdate_of(string filename) const;
+ string guid_of(string filename) const;
+}