about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--src/microformats/parser.clj251
-rw-r--r--test/microformats/parser_expectations.clj42
2 files changed, 160 insertions, 133 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj
index 61c1941..e29ca9b 100644
--- a/src/microformats/parser.clj
+++ b/src/microformats/parser.clj
@@ -5,6 +5,17 @@
             [clojure.string :as str]
             [clojurewerkz.urly.core :as url]))
 
+(defn- as-locs
+  [loc]
+  (if (and (vector? loc) (= (count loc) 2) (not (vector? (first loc))))
+    [loc]
+    loc))
+
+(defn- zip-select
+  "Workaround https://github.com/cgrand/enlive/issues/117"
+  [loc-or-locs selector]
+  (html/zip-select (as-locs loc-or-locs) (list (@#'html/automaton selector))))
+
 (defmacro anacond
   [& clauses]
   (when clauses
@@ -97,106 +108,114 @@
 
 (declare parse-h)
 
-(defn get-child-mf-properties
-  [element]
-  (assoc (parse-h element) :value (-> element :content node-to-text)))
-
 (defn remove-property-classes
   [element]
   (into {} (html/transform (list element) [html/root]
                            (apply html/remove-class (filter (prefixed-by? "p-")
                                                             (element-to-classes element))))))
+(defn get-child-mf-properties
+  [loc]
+  (assoc (parse-h (z/edit loc remove-property-classes)) :value (-> loc z/node :content node-to-text)))
 
 (defn- find-child-mf
   "Find child property microformats of an element."
-  [element]
-  (when (-> element :attrs :class (.indexOf "h-") (>= 0))
-    (-> element remove-property-classes get-child-mf-properties)))
+  [loc]
+  (let [element (z/node loc)]
+   (when (-> element :attrs :class (.indexOf "h-") (>= 0))
+     (get-child-mf-properties loc))))
 
 (defn get-p-value
   "Get the p-x property value of an element"
-  [el]
-  (or (find-child-mf el)
-      (str/trim (or (find-value-class el)
-                    (case (:tag el)
-                      :img (-> el :attrs :alt)
-                      :area (-> el :attrs :alt)
-                      :abbr (-> el :attrs :title)
-                      :data (-> el :attrs :value)
-                      :input (-> el :attrs :value)
-                      nil)
-                    (node-to-text (:content el))
-                    ""))))
+  [loc]
+  (let [el (z/node loc)]
+    (or (find-child-mf loc)
+        (str/trim (or (find-value-class el)
+                      (case (:tag el)
+                        :img (-> el :attrs :alt)
+                        :area (-> el :attrs :alt)
+                        :abbr (-> el :attrs :title)
+                        :data (-> el :attrs :value)
+                        :input (-> el :attrs :value)
+                        nil)
+                      (node-to-text (:content el))
+                      "")))))
 
 (defn get-u-value
   "Get the u-x property value of an element"
-  [el]
-  (str/trim (or (find-value-class el)
-                (case (:tag el)
-                  :a (-> el :attrs :href)
-                  :area (-> el :attrs :href)
-                  :img (-> el :attrs :src)
-                  :object (-> el :attrs :data)
-                  (get-p-value el))
-            (node-to-text (:content el))
-            "")))
+  [loc]
+  (let [el (z/node loc)]
+    (str/trim (or (find-value-class el)
+                  (case (:tag el)
+                    :a (-> el :attrs :href)
+                    :area (-> el :attrs :href)
+                    :img (-> el :attrs :src)
+                    :object (-> el :attrs :data)
+                    (get-p-value loc))
+                  (node-to-text (:content el))
+                  ""))))
 
 (defn get-dt-value
   "Get the dt-x property value of an element"
-  [el]
-  (str/trim (or (find-value-class el)
-                (case (:tag el)
-                  :time (-> el :attrs :datetime)
-                  :ins  (-> el :attrs :datetime)
-                  :del  (-> el :attrs :datetime)
-                  :abbr (-> el :attrs :title)
-                  :data (-> el :attrs :value)
-                  :input (-> el :attrs :value)
-                  nil)
-                (node-to-text (:content el))
-                "")))
+  [loc]
+  (let [el (z/node loc)]
+    (str/trim (or (find-value-class el)
+                  (case (:tag el)
+                    :time (-> el :attrs :datetime)
+                    :ins  (-> el :attrs :datetime)
+                    :del  (-> el :attrs :datetime)
+                    :abbr (-> el :attrs :title)
+                    :data (-> el :attrs :value)
+                    :input (-> el :attrs :value)
+                    nil)
+                  (node-to-text (:content el))
+                  ""))))
 
 (defn get-e-value
   "Get the e-x propery value of an element"
-  [el]
-  (let [content (:content el)]
+  [loc]
+  (let [el (z/node loc)
+        content (:content el)]
     (list {:html (apply str (node-to-html content))
            :value (apply str (node-to-text content))})))
 
 (defn parse-p
   "Parse p-* classes within HTML element."
-  [element]
-  (->> element
+  [loc]
+  (->> loc
+       z/node
        element-to-classes
        ((classes-to-props "p-"))
-       (r/map #(hash-map % (list (get-p-value element))))
+       (r/map #(hash-map % (list (get-p-value loc))))
        (into {})))
 
 (defn parse-u
   "Parse u-* classes within HTML element"
-  [element]
-  (->> element
+  [loc]
+  (->> loc
+       z/node
        element-to-classes
        ((classes-to-props "u-"))
-       (r/map #(hash-map % (list (get-u-value element))))
+       (r/map #(hash-map % (list (get-u-value loc))))
        (into {})))
 
 (defn parse-dt
   "Parse dt-* classes within HTML element"
-  [element]
-  (->> element
+  [loc]
+  (->> loc
+       z/node
        element-to-classes
        ((classes-to-props "dt-"))
-       (r/map #(hash-map % (list (get-dt-value element))))
+       (r/map #(hash-map % (list (get-dt-value loc))))
        (into {})))
 
 (defn parse-e
   "Parse e-* classes within HTML element"
-  [element]
-  (->> element
+  [loc]
+  (->> loc
+       z/node
        element-to-classes
        ((classes-to-props "e-"))
-       (r/map #(hash-map % (get-e-value element)))
+       (r/map #(hash-map % (get-e-value loc)))
        (into {})))
 
 (defn- get-mf-names
@@ -208,111 +227,115 @@
 
 (defn- parse-implied-name
   "Get the implied name of an entity"
-  [element]
-  (case (:tag element)
-    :abbr (-> element :attrs :title)
-    :img (-> element :attrs :alt)
-    (anacond
-     (first (html/select element [html/root :> [:img html/only-child]]))
-     (-> % :attrs :alt)
-     (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]]))
-     (-> % :attrs :title)
-     (first (html/select element [html/root :> html/only-child :> [:img html/only-child]]))
-     (-> % :attrs :alt)
-     (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]]))
-     (-> % :attrs :title)
-     true (node-to-text (:content element)))))
+  [loc]
+  (let [element (z/node loc)]
+    (case (:tag element)
+      :abbr (-> element :attrs :title)
+      :img (-> element :attrs :alt)
+      (anacond
+       (first (html/select element [html/root :> [:img html/only-child]]))
+       (-> % :attrs :alt)
+       (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]]))
+       (-> % :attrs :title)
+       (first (html/select element [html/root :> html/only-child :> [:img html/only-child]]))
+       (-> % :attrs :alt)
+       (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]]))
+       (-> % :attrs :title)
+       true (node-to-text (:content element))))))
 
 (defn- parse-implied-url
-  [element]
-  (case (:tag element)
-    :a (-> element :attrs :href)
-    (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
-      (-> % :attrs :href))))
+  [loc]
+  (let [element (z/node loc)]
+    (case (:tag element)
+      :a (-> element :attrs :href)
+      (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))]
+        (-> % :attrs :href)))))
 
 (defn- parse-implied-photo
-  [element]
-  (case (:tag element)
-    :img (-> element :attrs :src)
-    :object (-> element :attrs :data)
-    (anacond
-     (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-     (-> % :attrs :src)
-     (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-     (-> % :attrs :data)
-     (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-     (-> % :attrs :src)
-     (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
-     (-> % :attrs :data)
-     )))
+  [loc]
+  (let [element (z/node loc)]
+    (case (:tag element)
+      :img (-> element :attrs :src)
+      :object (-> element :attrs :data)
+      (anacond
+       (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+       (-> % :attrs :src)
+       (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+       (-> % :attrs :data)
+       (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+       (-> % :attrs :src)
+       (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]]))
+       (-> % :attrs :data)
+       ))))
 
 (def empty-ish
   #(not (str/blank? (first (second %)))))
 
 (defn parse-implied
   "Parse implied properties of a HTML element"
-  [element]
+  [loc]
   (into {} (filter empty-ish
-                   {:name (list (parse-implied-name element))
-                    :url (list (parse-implied-url element))
-                    :photo (list (parse-implied-photo element))})))
+                   {:name (list (parse-implied-name loc))
+                    :url (list (parse-implied-url loc))
+                    :photo (list (parse-implied-photo loc))})))
 
 (defn- select-p
-  [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]]))
+  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]]))
 
 (defn- select-u
-  [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]]))
+  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]]))
 
 (defn- select-dt
-  [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]]))
+  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]]))
 
 (defn- select-e
-  [element] (html/select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]]))
+  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]]))
 
 (defn get-mf-properties
   "Parse children of a microformat, returning a map of properties"
-  [element]
+  [loc]
   (let [cappend (partial merge-with concat)]
-    (merge (parse-implied element)
-           (apply cappend (map parse-p (select-p element)))
-           (apply cappend (map parse-u (select-u element)))
-           (apply cappend (map parse-dt (select-dt element)))
-           (apply cappend (map parse-e (select-e element))))))
+    (merge (parse-implied loc)
+           (apply cappend (map parse-p (select-p loc)))
+           (apply cappend (map parse-u (select-u loc)))
+           (apply cappend (map parse-dt (select-dt loc)))
+           (apply cappend (map parse-e (select-e loc))))))
 
 (defn parse-h
   "Parse h-* classes within a HTML element."
-  [element]
-  (hash-map :type (get-mf-names element)
-            :properties (get-mf-properties element)))
+  [loc]
+  (hash-map :type (get-mf-names (z/node loc))
+            :properties (get-mf-properties loc)))
 
 (defn map-h
   "Map fn to top-level h-* elements within a HTML element."
   [fn loc]
   (if (some->> loc z/node :attrs :class (re-matcher #"(?:^|\s) h-\w"))
-    (list (fn (z/node loc)))
+    (list (fn loc))
     (when (not (z/end? loc))
       (recur fn (z/next loc)))))
 
 (defn parse-rel
   "Parse rel attributes of an HTML link element"
-  [element]
-  (->> element
+  [loc]
+  (->> loc
+       z/node
        element-to-rels
        (map keyword)
-       (map #(hash-map % [(-> element :attrs :href)]))
+       (map #(hash-map % [(-> loc z/node :attrs :href)]))
        (into {})))
 
 (defn select-rels
   "Select linking HTML elements with rel attributes"
-  [html] (html/select html [[#{:a :link} (html/attr? :rel)]]))
+  [html] (zip-select html [[#{:a :link} (html/attr? :rel)]]))
 
 (defn parse-rels
   "Parse rel attibutes of a set of HTML link elements"
-  [elements]
-  (or (apply merge-with into (map parse-rel (select-rels elements))) {}))
+  [locs]
+  (or (apply merge-with into (map parse-rel (select-rels locs))) {}))
 
 (defn parse
   "Parse a HTML string with microformats"
   [html]
-  (let [document (html/html-snippet (str/trim html))]
-    {:items (some->> document first z/xml-zip (map-h parse-h)) :rels (parse-rels document)}))
+  (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))]
+    {:items (some->> document (map-h parse-h)) :rels (parse-rels document)}))
diff --git a/test/microformats/parser_expectations.clj b/test/microformats/parser_expectations.clj
index 0e72e10..6ce942e 100644
--- a/test/microformats/parser_expectations.clj
+++ b/test/microformats/parser_expectations.clj
@@ -4,9 +4,13 @@
             [clojure.zip :as z]
             [net.cgrand.enlive-html :refer [html-snippet]]))
 
+(defn- snippets
+  [html]
+  (map z/xml-zip (html-snippet html)))
+
 (defn- snippet
   [html]
-  (first (html-snippet html)))
+  (first (snippets html)))
 
 (expect [:location]
         (into [] ((classes-to-props "p-") ["someclass" "p-location" "someotherclass"])))
@@ -22,34 +26,34 @@
 <span class=\"p-street-address\">665 3rd St.</span>
 <span class=\"p-extended-address\">Suite 207</span>
 </p>"
-             snippet :content
+             snippet z/node :content
              (#'microformats.parser/node-to-text)))
 
 (expect '({:tag :div :attrs {:class "h-card"}
            :content nil})
-        (map-h identity (z/xml-zip (snippet
-                                    "<div class=\"h-card\"></div>"))))
+        (map-h z/node (snippet
+                       "<div class=\"h-card\"></div>")))
 
 (expect '({:tag :div :attrs {:class "h-card"}
            :content nil})
-        (map-h identity (z/xml-zip (snippet
-                                    "<header><div class=\"h-card\"></div></header>"))))
+        (map-h z/node (snippet
+                       "<header><div class=\"h-card\"></div></header>")))
 
 (expect '({:tag :div :attrs {:class "h-card"}
            :content ({:tag :a :attrs {:class "h-org"}
                       :content nil})})
-        (map-h identity (z/xml-zip (snippet
-                                    "<div class=\"h-card\"><a class=\"h-org\"></a></div>"))))
-
-(expect '({:tag :div :attrs {:class "h-card"}
-           :content ("\n"
-                     {:tag :p :attrs nil
-                      :content ({:tag :a :attrs {:class "h-org"}
-                                 :content nil})}
-                     "\n")})
-        (map-h identity (z/xml-zip (snippet "<div class=\"h-card\">
+        (map-h z/node (snippet
+                       "<div class=\"h-card\"><a class=\"h-org\"></a></div>")))
+
+(expect {:tag :div :attrs {:class "h-card"}
+                 :content ["\n"
+                           {:tag :p :attrs nil
+                            :content [{:tag :a :attrs {:class "h-org"}
+                                       :content nil}]}
+                           "\n"]}
+                (z/node (snippet "<div class=\"h-card\">
 <p><a class=\"h-org\"></a></p>
-</div>"))))
+</div>")))
 
 (expect "http://example.com"
         (get-base-url (snippet "<head><base href=\"http://example.com\"></head>")))
@@ -151,10 +155,10 @@
         (parse-e (snippet "<div class=\"e-content\">Here is a load of <strong>embedded markup</strong></div>")))
 
 (expect {:author '("http://example.com/a")}
-        (parse-rels (html-snippet "<a rel=\"author\" href=\"http://example.com/a\">author a</a>")))
+        (parse-rels (snippets "<a rel=\"author\" href=\"http://example.com/a\">author a</a>")))
 
 (expect {:author '("http://example.com/a" "http://example.com/b")}
-        (parse-rels (html-snippet "<a rel=\"author\" href=\"http://example.com/a\">author a</a>
+        (parse-rels (snippets "<a rel=\"author\" href=\"http://example.com/a\">author a</a>
 <a rel=\"author\" href=\"http://example.com/b\">author b</a>")))
 
 (expect {:items nil :rels {}}