From fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d Mon Sep 17 00:00:00 2001 From: Alan Pearce Date: Sun, 19 Oct 2014 11:03:51 +0100 Subject: Re-implement base url parsing Attach metadata to root nodes and pass it down when walking child nodes. --- src/microformats/parser.clj | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj index 7cb3062..21fcc26 100644 --- a/src/microformats/parser.clj +++ b/src/microformats/parser.clj @@ -81,13 +81,25 @@ (defn get-base-url "Find the base-url of a document." [document] - (or (-> document + (or (some-> document + meta + :base) + (-> document (html/select [:head :> [:base (html/attr? :href)]]) first :attrs :href) "")) +(defn with-base-url + "Attach the base URL of a document as metadata" + ([document] + (with-base-url (get-base-url document) document)) + ([base-url document] + (if (instance? clojure.lang.IObj document) + (vary-meta document assoc :base base-url) + document))) + (defn normalise-url "Normalise a URL" [root url] @@ -132,7 +144,7 @@ "Find child property microformats of an element." [loc] (let [element (z/node loc)] - (when (-> element :attrs :class (.indexOf "h-") (>= 0)) + (when (-> element :attrs :class (.indexOf "h-") (>= 0)) (get-child-mf-properties loc)))) (defn get-p-value @@ -343,16 +355,18 @@ (map (partial parse-mf loc) (single-pass-child types)) (recur (z/next loc))))) -(def map-walk +(defn map-walk + [root] (comp (r/map (partial apply merge)) (r/filter identity) (r/map walk) - (r/map z/xml-zip))) + (r/map z/xml-zip) + (r/map (partial with-base-url (get-base-url root))))) (defn walk-children "Walk through child elements of loc" [loc] - (some->> loc z/children map-walk (into []))) + (some->> loc z/children ((map-walk (z/root loc))) (into []))) (defn parse-rel "Parse rel attributes of an HTML link element" @@ -376,5 +390,5 @@ (defn parse "Parse a HTML string with microformats" [html] - (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] + (let [document (some->> html str/trim html/html-snippet with-base-url (map z/xml-zip) first)] {:items (some-> document walk) :rels (parse-rels document)})) -- cgit 1.4.1