about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
authorAlan Pearce2014-10-11 21:17:21 +0100
committerAlan Pearce2014-10-11 21:17:21 +0100
commit4f91f0d7e5ef93add2576a372dde441e278c8f5b (patch)
tree8a8a7da25ca262f7ccb7349aba8aac951bd630cf /src
parente81f922f1522030ac801c9f27b8732f0e81d2e08 (diff)
downloadmicroformats-4f91f0d7e5ef93add2576a372dde441e278c8f5b.tar.lz
microformats-4f91f0d7e5ef93add2576a372dde441e278c8f5b.tar.zst
microformats-4f91f0d7e5ef93add2576a372dde441e278c8f5b.zip
Use tree-walking for property parsing
Diffstat (limited to 'src')
-rw-r--r--src/microformats/parser.clj144
1 files changed, 77 insertions, 67 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj
index 8ae5665..9bc9d84 100644
--- a/src/microformats/parser.clj
+++ b/src/microformats/parser.clj
@@ -188,45 +188,59 @@
     (list {:html (apply str (node-to-html content))
            :value (apply str (node-to-text content))})))
 
-(defn parse-p
-  "Parse p-* classes within HTML element."
-  [loc]
-  (->> loc
-       z/node
-       element-to-classes
-       ((classes-to-props "p-"))
-       (r/map #(hash-map % (list (get-p-value loc))))
-       (into {})))
+(declare continue-walking)
 
-(defn parse-u
-  "Parse u-* classes within HTML element"
-  [loc]
-  (->> loc
-       z/node
-       element-to-classes
-       ((classes-to-props "u-"))
-       (r/map #(hash-map % (list (get-u-value loc))))
-       (into {})))
+(defn gen-property-parser
+  "Create a property parser"
+  [f]
+  (fn [loc]
+    (apply (partial merge-with concat)
+           (f loc)
+           (continue-walking loc))))
 
-(defn parse-dt
+(def parse-p
+  "Parse p-* classes within HTML element."
+  (gen-property-parser
+   (fn [loc]
+     (->> loc
+          z/node
+          element-to-classes
+          ((classes-to-props "p-"))
+          (r/map #(hash-map % (list (get-p-value loc))))
+          (into {})))))
+
+(def parse-u
+  "Parse u-* classes within HTML element"
+  (gen-property-parser
+   (fn [loc]
+     (->> loc
+          z/node
+          element-to-classes
+          ((classes-to-props "u-"))
+          (r/map #(hash-map % (list (get-u-value loc))))
+          (into {})))))
+
+(def parse-dt
   "Parse dt-* classes within HTML element"
-  [loc]
-  (->> loc
-       z/node
-       element-to-classes
-       ((classes-to-props "dt-"))
-       (r/map #(hash-map % (list (get-dt-value loc))))
-       (into {})))
-
-(defn parse-e
+  (gen-property-parser
+   (fn [loc]
+     (->> loc
+          z/node
+          element-to-classes
+          ((classes-to-props "dt-"))
+          (r/map #(hash-map % (list (get-dt-value loc))))
+          (into {})))))
+
+(def parse-e
   "Parse e-* classes within HTML element"
-  [loc]
-  (->> loc
-       z/node
-       element-to-classes
-       ((classes-to-props "e-"))
-       (r/map #(hash-map % (get-e-value loc)))
-       (into {})))
+  (gen-property-parser
+   (fn [loc]
+     (->> loc
+          z/node
+          element-to-classes
+          ((classes-to-props "e-"))
+          (r/map #(hash-map % (get-e-value loc)))
+          (into {})))))
 
 (defn- get-mf-names
   "Get the microformat names from an element"
@@ -292,41 +306,37 @@
                     :url (list (parse-implied-url loc))
                     :photo (list (parse-implied-photo loc))})))
 
-(defn- select-p
-  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]]))
-
-(defn- select-u
-  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]]))
-
-(defn- select-dt
-  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]]))
-
-(defn- select-e
-  [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]]))
-
-(defn get-mf-properties
-  "Parse children of a microformat, returning a map of properties"
-  [loc]
-  (let [cappend (partial merge-with concat)]
-    (merge (parse-implied loc)
-           (apply cappend (map parse-p (select-p loc)))
-           (apply cappend (map parse-u (select-u loc)))
-           (apply cappend (map parse-dt (select-dt loc)))
-           (apply cappend (map parse-e (select-e loc))))))
-
 (defn parse-h
   "Parse h-* classes within a HTML element."
   [loc]
   (hash-map :type (get-mf-names (z/node loc))
-            :properties (get-mf-properties loc)))
-
-(defn map-h
-  "Map fn to top-level h-* elements within a HTML element."
-  [fn loc]
-  (if (some->> loc z/node :attrs :class (re-matcher #"(?:^|\s) h-\w"))
-    (list (fn loc))
-    (when (not (z/end? loc))
-      (recur fn (z/next loc)))))
+            :properties (apply merge (parse-implied loc)
+                               (continue-walking loc))))
+
+(defn parse-mf
+  "Parse microformats within a HTML element."
+  [loc mf-type]
+  (case mf-type
+    "h" (parse-h loc)
+    "p" (parse-p loc)
+    "u" (parse-u loc)
+    "dt" (parse-dt loc)
+    "e" (parse-e loc)))
+
+(defn walk
+  "Walk HTML element tree for microformat properties."
+  [loc]
+  (when (and (not (z/end? loc))
+             (not (contains? #{:br :hr} (-> loc z/node :tag))))
+    (if-let [class-groups (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) distinct first list)]
+      (map (partial parse-mf loc) class-groups)
+      (recur (z/next loc)))))
+
+(defn continue-walking
+  "Keep walking that tree"
+  [loc]
+  (when (not (z/end? loc))
+    (walk (z/next loc))))
 
 (defn parse-rel
   "Parse rel attributes of an HTML link element"
@@ -351,4 +361,4 @@
   "Parse a HTML string with microformats"
   [html]
   (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))]
-    {:items (some->> document (map-h parse-h)) :rels (parse-rels document)}))
+    {:items (some-> document walk) :rels (parse-rels document)}))