diff options
author | Alan Pearce | 2014-10-10 16:23:32 +0100 |
---|---|---|
committer | Alan Pearce | 2014-10-10 16:23:32 +0100 |
commit | 3213c0c46a152e709772f0818a9281f5ce0e1988 (patch) | |
tree | 8aeba85dd1a45b6ae06a01497e993bcd52b754ca | |
parent | 49ed30908feb17d4ca6ea5dd2536b7cf79e395d1 (diff) | |
download | microformats-3213c0c46a152e709772f0818a9281f5ce0e1988.tar.lz microformats-3213c0c46a152e709772f0818a9281f5ce0e1988.tar.zst microformats-3213c0c46a152e709772f0818a9281f5ce0e1988.zip |
Normalise parsed URLs
-rw-r--r-- | src/microformats/parser.clj | 47 | ||||
-rw-r--r-- | test/microformats/parser_expectations.clj | 25 |
2 files changed, 50 insertions, 22 deletions
```diff
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj
index e29ca9b..0b26372 100644
--- a/src/microformats/parser.clj
+++ b/src/microformats/parser.clj
@@ -146,10 +146,10 @@
   (let [el (z/node loc)]
     (str/trim (or (find-value-class el)
                   (case (:tag el)
-                    :a (-> el :attrs :href)
-                    :area (-> el :attrs :href)
-                    :img (-> el :attrs :src)
-                    :object (-> el :attrs :data)
+                    :a (normalise-url (z/root loc) (-> el :attrs :href))
+                    :area (normalise-url (z/root loc) (-> el :attrs :href))
+                    :img (normalise-url (z/root loc) (-> el :attrs :src))
+                    :object (normalise-url (z/root loc) (-> el :attrs :data))
                     (get-p-value loc))
                   (node-to-text (:content el))
                   ""))))
@@ -246,27 +246,30 @@
 (defn- parse-implied-url
   [loc]
   (let [element (z/node loc)]
-    (case (:tag element)
-      :a (-> element :attrs :href)
-      (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
-        (-> % :attrs :href)))))
+    (some->>
+     (case (:tag element)
+       :a (-> element :attrs :href)
+       (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
+         (-> % :attrs :href)))
+     (normalise-url (z/root loc)))))
 
 (defn- parse-implied-photo
   [loc]
   (let [element (z/node loc)]
-    (case (:tag element)
-      :img (-> element :attrs :src)
-      :object (-> element :attrs :data)
-      (anacond
-       (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-       (-> % :attrs :src)
-       (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-       (-> % :attrs :data)
-       (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-       (-> % :attrs :src)
-       (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-       (-> % :attrs :data)
-       ))))
+    (some->>
+     (case (:tag element)
+       :img (-> element :attrs :src)
+       :object (-> element :attrs :data)
+       (anacond
+        (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+        (-> % :attrs :src)
+        (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+        (-> % :attrs :data)
+        (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+        (-> % :attrs :src)
+        (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+        (-> % :attrs :data)))
+     (normalise-url (z/root loc)))))
 
 (def empty-ish
   #(not (str/blank? (first (second %)))))
@@ -322,7 +325,7 @@
        z/node
        element-to-rels
        (map keyword)
-       (map #(hash-map % [(-> loc z/node :attrs :href)]))
+       (map #(hash-map % [(normalise-url (z/root loc) (-> loc z/node :attrs :href))]))
        (into {})))
 
 (defn select-rels
diff --git a/test/microformats/parser_expectations.clj b/test/microformats/parser_expectations.clj
index 6ce942e..92e4448 100644
--- a/test/microformats/parser_expectations.clj
+++ b/test/microformats/parser_expectations.clj
@@ -157,6 +157,11 @@
 (expect {:author '("http://example.com/a")}
         (parse-rels (snippets "<a rel=\"author\" href=\"http://example.com/a\">author a</a>")))
 
+(expect {:author '("http://example.com/a")}
+        (parse-rels (snippets "
+<html><head><base href=\"http://example.com\"><body>
+<a rel=\"author\" href=\"/a\">author a</a>")))
+
 (expect {:author '("http://example.com/a" "http://example.com/b")}
         (parse-rels (snippets "<a rel=\"author\" href=\"http://example.com/a\">author a</a>
 <a rel=\"author\" href=\"http://example.com/b\">author b</a>")))
@@ -276,3 +281,23 @@
 <span class=\"p-name\">John Doe</span>
 <span class=\"p-org h-card h-org\">Example</span>
 </div>"))
+
+(expect {:items '({:type ("h-card") :properties {:name ("Example User") :url ("http://example.com/")}}) :rels {}}
+        (parse "<html><head><base href=\"http://example.com\"></head><body>
+<div class=\"h-card\"><a class=\"u-url\" href=\"/\">Example User</a></div></body></html>"))
+
+(expect {:items '({:type ("h-card") :properties {:name ("Example User") :url ("http://example.com/")}}) :rels {}}
+        (parse "<html><head><base href=\"http://example.com\"></head><body>
+<div class=\"h-card\"><a href=\"/\">Example User</a></div></body></html>"))
+
+(expect {:items '({:type ("h-card") :properties {:name ("Example User")
+                                                 :photo ("http://example.com/me.png")}}) :rels {}}
+        (parse "<html><head><base href=\"http://example.com/\"></head><body>
+<div class=\"h-card\"><img alt=\"Example User\" src=\"me.png\"></div>
+</body></html>"))
+
+(expect {:items '({:type ("h-card") :properties {:name ("Example User")
+                                                 :photo ("http://example.com/me.png")}}) :rels {}}
+        (parse "<html><head><base href=\"http://example.com/\"></head><body>
+<div class=\"h-card\"><img class=\"u-photo\" alt=\"Example User\" src=\"me.png\"></div>
+</body></html>"))
```