diff options
author | Alan Pearce | 2014-10-19 11:03:51 +0100 |
---|---|---|
committer | Alan Pearce | 2014-10-19 11:03:51 +0100 |
commit | fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d (patch) | |
tree | 82144bc3d43dded3b1167139d806b4c259914448 | |
parent | 8899c5515aa3cad313250c82cacc895f8cd89e2f (diff) | |
download | microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.lz microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.tar.zst microformats-fe2e6c08a4f8d42317fe8e97a62e061209d4ed4d.zip |
Re-implement base url parsing
Attach metadata to root nodes and pass it down when walking child nodes.
-rw-r--r-- | src/microformats/parser.clj | 26 |
1 files changed, 20 insertions, 6 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj index 7cb3062..21fcc26 100644 --- a/src/microformats/parser.clj +++ b/src/microformats/parser.clj | |||
@@ -81,13 +81,25 @@ | |||
81 | (defn get-base-url | 81 | (defn get-base-url |
82 | "Find the base-url of a document." | 82 | "Find the base-url of a document." |
83 | [document] | 83 | [document] |
84 | (or (-> document | 84 | (or (some-> document |
85 | meta | ||
86 | :base) | ||
87 | (-> document | ||
85 | (html/select [:head :> [:base (html/attr? :href)]]) | 88 | (html/select [:head :> [:base (html/attr? :href)]]) |
86 | first | 89 | first |
87 | :attrs | 90 | :attrs |
88 | :href) | 91 | :href) |
89 | "")) | 92 | "")) |
90 | 93 | ||
94 | (defn with-base-url | ||
95 | "Attach the base URL of a document as metadata" | ||
96 | ([document] | ||
97 | (with-base-url (get-base-url document) document)) | ||
98 | ([base-url document] | ||
99 | (if (instance? clojure.lang.IObj document) | ||
100 | (vary-meta document assoc :base base-url) | ||
101 | document))) | ||
102 | |||
91 | (defn normalise-url | 103 | (defn normalise-url |
92 | "Normalise a URL" | 104 | "Normalise a URL" |
93 | [root url] | 105 | [root url] |
@@ -132,7 +144,7 @@ | |||
132 | "Find child property microformats of an element." | 144 | "Find child property microformats of an element." |
133 | [loc] | 145 | [loc] |
134 | (let [element (z/node loc)] | 146 | (let [element (z/node loc)] |
135 | (when (-> element :attrs :class (.indexOf "h-") (>= 0)) | 147 | (when (-> element :attrs :class (.indexOf "h-") (>= 0)) |
136 | (get-child-mf-properties loc)))) | 148 | (get-child-mf-properties loc)))) |
137 | 149 | ||
138 | (defn get-p-value | 150 | (defn get-p-value |
@@ -343,16 +355,18 @@ | |||
343 | (map (partial parse-mf loc) (single-pass-child types)) | 355 | (map (partial parse-mf loc) (single-pass-child types)) |
344 | (recur (z/next loc))))) | 356 | (recur (z/next loc))))) |
345 | 357 | ||
346 | (def map-walk | 358 | (defn map-walk |
359 | [root] | ||
347 | (comp (r/map (partial apply merge)) | 360 | (comp (r/map (partial apply merge)) |
348 | (r/filter identity) | 361 | (r/filter identity) |
349 | (r/map walk) | 362 | (r/map walk) |
350 | (r/map z/xml-zip))) | 363 | (r/map z/xml-zip) |
364 | (r/map (partial with-base-url (get-base-url root))))) | ||
351 | 365 | ||
352 | (defn walk-children | 366 | (defn walk-children |
353 | "Walk through child elements of loc" | 367 | "Walk through child elements of loc" |
354 | [loc] | 368 | [loc] |
355 | (some->> loc z/children map-walk (into []))) | 369 | (some->> loc z/children ((map-walk (z/root loc))) (into []))) |
356 | 370 | ||
357 | (defn parse-rel | 371 | (defn parse-rel |
358 | "Parse rel attributes of an HTML link element" | 372 | "Parse rel attributes of an HTML link element" |
@@ -376,5 +390,5 @@ | |||
376 | (defn parse | 390 | (defn parse |
377 | "Parse a HTML string with microformats" | 391 | "Parse a HTML string with microformats" |
378 | [html] | 392 | [html] |
379 | (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] | 393 | (let [document (some->> html str/trim html/html-snippet with-base-url (map z/xml-zip) first)] |
380 | {:items (some-> document walk) :rels (parse-rels document)})) | 394 | {:items (some-> document walk) :rels (parse-rels document)})) |