about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorAlan Pearce2014-10-19 11:03:51 +0100
committerAlan Pearce2014-10-19 11:03:51 +0100
commitfe2e6c08a4f8d42317fe8e97a62e061209d4ed4d (patch)
tree82144bc3d43dded3b1167139d806b4c259914448
parent8899c5515aa3cad313250c82cacc895f8cd89e2f (diff)
downloadmicroformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.lz
microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.zst
microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.zip
Re-implement base url parsing
Attach metadata to root nodes and pass it down when walking child nodes.
-rw-r--r--src/microformats/parser.clj26
1 files changed, 20 insertions, 6 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj index 7cb3062..21fcc26 100644 --- a/src/microformats/parser.clj +++ b/src/microformats/parser.clj
@@ -81,13 +81,25 @@
81(defn get-base-url 81(defn get-base-url
82 "Find the base-url of a document." 82 "Find the base-url of a document."
83 [document] 83 [document]
84 (or (-> document 84 (or (some-> document
85 meta
86 :base)
87 (-> document
85 (html/select [:head :> [:base (html/attr? :href)]]) 88 (html/select [:head :> [:base (html/attr? :href)]])
86 first 89 first
87 :attrs 90 :attrs
88 :href) 91 :href)
89 "")) 92 ""))
90 93
94(defn with-base-url
95 "Attach the base URL of a document as metadata"
96 ([document]
97 (with-base-url (get-base-url document) document))
98 ([base-url document]
99 (if (instance? clojure.lang.IObj document)
100 (vary-meta document assoc :base base-url)
101 document)))
102
91(defn normalise-url 103(defn normalise-url
92 "Normalise a URL" 104 "Normalise a URL"
93 [root url] 105 [root url]
@@ -132,7 +144,7 @@
132 "Find child property microformats of an element." 144 "Find child property microformats of an element."
133 [loc] 145 [loc]
134 (let [element (z/node loc)] 146 (let [element (z/node loc)]
135 (when (-> element :attrs :class (.indexOf "h-") (>= 0)) 147 (when (-> element :attrs :class (.indexOf "h-") (>= 0))
136 (get-child-mf-properties loc)))) 148 (get-child-mf-properties loc))))
137 149
138(defn get-p-value 150(defn get-p-value
@@ -343,16 +355,18 @@
343 (map (partial parse-mf loc) (single-pass-child types)) 355 (map (partial parse-mf loc) (single-pass-child types))
344 (recur (z/next loc))))) 356 (recur (z/next loc)))))
345 357
346(def map-walk 358(defn map-walk
359 [root]
347 (comp (r/map (partial apply merge)) 360 (comp (r/map (partial apply merge))
348 (r/filter identity) 361 (r/filter identity)
349 (r/map walk) 362 (r/map walk)
350 (r/map z/xml-zip))) 363 (r/map z/xml-zip)
364 (r/map (partial with-base-url (get-base-url root)))))
351 365
352(defn walk-children 366(defn walk-children
353 "Walk through child elements of loc" 367 "Walk through child elements of loc"
354 [loc] 368 [loc]
355 (some->> loc z/children map-walk (into []))) 369 (some->> loc z/children ((map-walk (z/root loc))) (into [])))
356 370
357(defn parse-rel 371(defn parse-rel
358 "Parse rel attributes of an HTML link element" 372 "Parse rel attributes of an HTML link element"
@@ -376,5 +390,5 @@
376(defn parse 390(defn parse
377 "Parse a HTML string with microformats" 391 "Parse a HTML string with microformats"
378 [html] 392 [html]
379 (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] 393 (let [document (some->> html str/trim html/html-snippet with-base-url (map z/xml-zip) first)]
380 {:items (some-> document walk) :rels (parse-rels document)})) 394 {:items (some-> document walk) :rels (parse-rels document)}))