about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorAlan Pearce2014-10-19 11:03:51 +0100
committerAlan Pearce2014-10-19 11:03:51 +0100
commitfe2e6c08a4f8d42317fe8e97a62e061209d4ed4d (patch)
tree82144bc3d43dded3b1167139d806b4c259914448
parent8899c5515aa3cad313250c82cacc895f8cd89e2f (diff)
downloadmicroformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.lz
microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.zst
microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.zip
Re-implement base url parsing
Attach metadata to root nodes and pass it down when walking child nodes.
-rw-r--r--src/microformats/parser.clj26
1 files changed, 20 insertions, 6 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj
index 7cb3062..21fcc26 100644
--- a/src/microformats/parser.clj
+++ b/src/microformats/parser.clj
@@ -81,13 +81,25 @@
 (defn get-base-url
   "Find the base-url of a document."
   [document]
-  (or (-> document
+  (or (some-> document
+              meta
+              :base)
+      (-> document
           (html/select [:head :> [:base (html/attr? :href)]])
           first
           :attrs
           :href)
       ""))
 
+(defn with-base-url
+  "Attach the base URL of a document as metadata"
+  ([document]
+     (with-base-url (get-base-url document) document))
+  ([base-url document]
+     (if (instance? clojure.lang.IObj document)
+       (vary-meta document assoc :base base-url)
+       document)))
+
 (defn normalise-url
   "Normalise a URL"
   [root url]
@@ -132,7 +144,7 @@
   "Find child property microformats of an element."
   [loc]
   (let [element (z/node loc)]
-   (when (-> element :attrs :class (.indexOf "h-") (>= 0))
+    (when (-> element :attrs :class (.indexOf "h-") (>= 0))
      (get-child-mf-properties loc))))
 
 (defn get-p-value
@@ -343,16 +355,18 @@
       (map (partial parse-mf loc) (single-pass-child types))
       (recur (z/next loc)))))
 
-(def map-walk
+(defn map-walk
+  [root]
   (comp (r/map (partial apply merge))
         (r/filter identity)
         (r/map walk)
-        (r/map z/xml-zip)))
+        (r/map z/xml-zip)
+        (r/map (partial with-base-url (get-base-url root)))))
 
 (defn walk-children
   "Walk through child elements of loc"
   [loc]
-  (some->> loc z/children map-walk (into [])))
+  (some->> loc z/children ((map-walk (z/root loc))) (into [])))
 
 (defn parse-rel
   "Parse rel attributes of an HTML link element"
@@ -376,5 +390,5 @@
 (defn parse
   "Parse a HTML string with microformats"
   [html]
-  (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))]
+  (let [document (some->> html str/trim html/html-snippet with-base-url (map z/xml-zip) first)]
     {:items (some-> document walk) :rels (parse-rels document)}))