From 49ed30908feb17d4ca6ea5dd2536b7cf79e395d1 Mon Sep 17 00:00:00 2001 From: Alan Pearce Date: Fri, 10 Oct 2014 16:12:53 +0100 Subject: Use zippers all the way down Includes an ugly "workaround" for interface differences in enlive between select and zip-select --- src/microformats/parser.clj | 251 ++++++++++++++++++++++++-------------------- 1 file changed, 137 insertions(+), 114 deletions(-) (limited to 'src') diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj index 61c1941..e29ca9b 100644 --- a/src/microformats/parser.clj +++ b/src/microformats/parser.clj @@ -5,6 +5,17 @@ [clojure.string :as str] [clojurewerkz.urly.core :as url])) +(defn- as-locs + [loc] + (if (and (vector? loc) (= (count loc) 2) (not (vector? (first loc)))) + [loc] + loc)) + +(defn- zip-select + "Workaround https://github.com/cgrand/enlive/issues/117" + [loc-or-locs selector] + (html/zip-select (as-locs loc-or-locs) (list (@#'html/automaton selector)))) + (defmacro anacond [& clauses] (when clauses @@ -97,106 +108,114 @@ (declare parse-h) -(defn get-child-mf-properties - [element] - (assoc (parse-h element) :value (-> element :content node-to-text))) - (defn remove-property-classes [element] (into {} (html/transform (list element) [html/root] (apply html/remove-class (filter (prefixed-by? "p-") (element-to-classes element)))))) +(defn get-child-mf-properties + [loc] + (assoc (parse-h (z/edit loc remove-property-classes)) :value (-> loc z/node :content node-to-text))) (defn- find-child-mf "Find child property microformats of an element." - [element] - (when (-> element :attrs :class (.indexOf "h-") (>= 0)) - (-> element remove-property-classes get-child-mf-properties))) + [loc] + (let [element (z/node loc)] + (when (-> element :attrs :class (.indexOf "h-") (>= 0)) + (get-child-mf-properties loc)))) (defn get-p-value "Get the p-x property value of an element" - [el] - (or (find-child-mf el) - (str/trim (or (find-value-class el) - (case (:tag el) - :img (-> el :attrs :alt) - :area (-> el :attrs :alt) - :abbr (-> el :attrs :title) - :data (-> el :attrs :value) - :input (-> el :attrs :value) - nil) - (node-to-text (:content el)) - "")))) + [loc] + (let [el (z/node loc)] + (or (find-child-mf loc) + (str/trim (or (find-value-class el) + (case (:tag el) + :img (-> el :attrs :alt) + :area (-> el :attrs :alt) + :abbr (-> el :attrs :title) + :data (-> el :attrs :value) + :input (-> el :attrs :value) + nil) + (node-to-text (:content el)) + ""))))) (defn get-u-value "Get the u-x property value of an element" - [el] - (str/trim (or (find-value-class el) - (case (:tag el) - :a (-> el :attrs :href) - :area (-> el :attrs :href) - :img (-> el :attrs :src) - :object (-> el :attrs :data) - (get-p-value el)) - (node-to-text (:content el)) - ""))) + [loc] + (let [el (z/node loc)] + (str/trim (or (find-value-class el) + (case (:tag el) + :a (-> el :attrs :href) + :area (-> el :attrs :href) + :img (-> el :attrs :src) + :object (-> el :attrs :data) + (get-p-value loc)) + (node-to-text (:content el)) + "")))) (defn get-dt-value "Get the dt-x property value of an element" - [el] - (str/trim (or (find-value-class el) - (case (:tag el) - :time (-> el :attrs :datetime) - :ins (-> el :attrs :datetime) - :del (-> el :attrs :datetime) - :abbr (-> el :attrs :title) - :data (-> el :attrs :value) - :input (-> el :attrs :value) - nil) - (node-to-text (:content el)) - ""))) + [loc] + (let [el (z/node loc)] + (str/trim (or (find-value-class el) + (case (:tag el) + :time (-> el :attrs :datetime) + :ins (-> el :attrs :datetime) + :del (-> el :attrs :datetime) + :abbr (-> el :attrs :title) + :data (-> el :attrs :value) + :input (-> el :attrs :value) + nil) + (node-to-text (:content el)) + "")))) (defn get-e-value "Get the e-x propery value of an element" - [el] - (let [content (:content el)] + [loc] + (let [el (z/node loc) + content (:content el)] (list {:html (apply str (node-to-html content)) :value (apply str (node-to-text content))}))) (defn parse-p "Parse p-* classes within HTML element." - [element] - (->> element + [loc] + (->> loc + z/node element-to-classes ((classes-to-props "p-")) - (r/map #(hash-map % (list (get-p-value element)))) + (r/map #(hash-map % (list (get-p-value loc)))) (into {}))) (defn parse-u "Parse u-* classes within HTML element" - [element] - (->> element + [loc] + (->> loc + z/node element-to-classes ((classes-to-props "u-")) - (r/map #(hash-map % (list (get-u-value element)))) + (r/map #(hash-map % (list (get-u-value loc)))) (into {}))) (defn parse-dt "Parse dt-* classes within HTML element" - [element] - (->> element + [loc] + (->> loc + z/node element-to-classes ((classes-to-props "dt-")) - (r/map #(hash-map % (list (get-dt-value element)))) + (r/map #(hash-map % (list (get-dt-value loc)))) (into {}))) (defn parse-e "Parse e-* classes within HTML element" - [element] - (->> element + [loc] + (->> loc + z/node element-to-classes ((classes-to-props "e-")) - (r/map #(hash-map % (get-e-value element))) + (r/map #(hash-map % (get-e-value loc))) (into {}))) (defn- get-mf-names @@ -208,111 +227,115 @@ (defn- parse-implied-name "Get the implied name of an entity" - [element] - (case (:tag element) - :abbr (-> element :attrs :title) - :img (-> element :attrs :alt) - (anacond - (first (html/select element [html/root :> [:img html/only-child]])) - (-> % :attrs :alt) - (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]])) - (-> % :attrs :title) - (first (html/select element [html/root :> html/only-child :> [:img html/only-child]])) - (-> % :attrs :alt) - (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]])) - (-> % :attrs :title) - true (node-to-text (:content element))))) + [loc] + (let [element (z/node loc)] + (case (:tag element) + :abbr (-> element :attrs :title) + :img (-> element :attrs :alt) + (anacond + (first (html/select element [html/root :> [:img html/only-child]])) + (-> % :attrs :alt) + (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]])) + (-> % :attrs :title) + (first (html/select element [html/root :> html/only-child :> [:img html/only-child]])) + (-> % :attrs :alt) + (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]])) + (-> % :attrs :title) + true (node-to-text (:content element)))))) (defn- parse-implied-url - [element] - (case (:tag element) - :a (-> element :attrs :href) - (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))] - (-> % :attrs :href)))) + [loc] + (let [element (z/node loc)] + (case (:tag element) + :a (-> element :attrs :href) + (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))] + (-> % :attrs :href))))) (defn- parse-implied-photo - [element] - (case (:tag element) - :img (-> element :attrs :src) - :object (-> element :attrs :data) - (anacond - (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) - (-> % :attrs :src) - (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) - (-> % :attrs :data) - (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) - (-> % :attrs :src) - (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) - (-> % :attrs :data) - ))) + [loc] + (let [element (z/node loc)] + (case (:tag element) + :img (-> element :attrs :src) + :object (-> element :attrs :data) + (anacond + (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) + (-> % :attrs :src) + (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) + (-> % :attrs :data) + (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) + (-> % :attrs :src) + (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) + (-> % :attrs :data) + )))) (def empty-ish #(not (str/blank? (first (second %))))) (defn parse-implied "Parse implied properties of a HTML element" - [element] + [loc] (into {} (filter empty-ish - {:name (list (parse-implied-name element)) - :url (list (parse-implied-url element)) - :photo (list (parse-implied-photo element))}))) + {:name (list (parse-implied-name loc)) + :url (list (parse-implied-url loc)) + :photo (list (parse-implied-photo loc))}))) (defn- select-p - [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]])) + [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]])) (defn- select-u - [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]])) + [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]])) (defn- select-dt - [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]])) + [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]])) (defn- select-e - [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]])) + [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]])) (defn get-mf-properties "Parse children of a microformat, returning a map of properties" - [element] + [loc] (let [cappend (partial merge-with concat)] - (merge (parse-implied element) - (apply cappend (map parse-p (select-p element))) - (apply cappend (map parse-u (select-u element))) - (apply cappend (map parse-dt (select-dt element))) - (apply cappend (map parse-e (select-e element)))))) + (merge (parse-implied loc) + (apply cappend (map parse-p (select-p loc))) + (apply cappend (map parse-u (select-u loc))) + (apply cappend (map parse-dt (select-dt loc))) + (apply cappend (map parse-e (select-e loc)))))) (defn parse-h "Parse h-* classes within a HTML element." - [element] - (hash-map :type (get-mf-names element) - :properties (get-mf-properties element))) + [loc] + (hash-map :type (get-mf-names (z/node loc)) + :properties (get-mf-properties loc))) (defn map-h "Map fn to top-level h-* elements within a HTML element." [fn loc] (if (some->> loc z/node :attrs :class (re-matcher #"(?:^|\s) h-\w")) - (list (fn (z/node loc))) + (list (fn loc)) (when (not (z/end? loc)) (recur fn (z/next loc))))) (defn parse-rel "Parse rel attributes of an HTML link element" - [element] - (->> element + [loc] + (->> loc + z/node element-to-rels (map keyword) - (map #(hash-map % [(-> element :attrs :href)])) + (map #(hash-map % [(-> loc z/node :attrs :href)])) (into {}))) (defn select-rels "Select linking HTML elements with rel attributes" - [html] (html/select html [[#{:a :link} (html/attr? :rel)]])) + [html] (zip-select html [[#{:a :link} (html/attr? :rel)]])) (defn parse-rels "Parse rel attibutes of a set of HTML link elements" - [elements] - (or (apply merge-with into (map parse-rel (select-rels elements))) {})) + [locs] + (or (apply merge-with into (map parse-rel (select-rels locs))) {})) (defn parse "Parse a HTML string with microformats" [html] - (let [document (html/html-snippet (str/trim html))] - {:items (some->> document first z/xml-zip (map-h parse-h)) :rels (parse-rels document)})) + (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] + {:items (some->> document (map-h parse-h)) :rels (parse-rels document)})) -- cgit 1.4.1