From 4f91f0d7e5ef93add2576a372dde441e278c8f5b Mon Sep 17 00:00:00 2001 From: Alan Pearce Date: Sat, 11 Oct 2014 21:17:21 +0100 Subject: Use tree-walking for property parsing --- src/microformats/parser.clj | 144 +++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 67 deletions(-) (limited to 'src') diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj index 8ae5665..9bc9d84 100644 --- a/src/microformats/parser.clj +++ b/src/microformats/parser.clj @@ -188,45 +188,59 @@ (list {:html (apply str (node-to-html content)) :value (apply str (node-to-text content))}))) -(defn parse-p - "Parse p-* classes within HTML element." - [loc] - (->> loc - z/node - element-to-classes - ((classes-to-props "p-")) - (r/map #(hash-map % (list (get-p-value loc)))) - (into {}))) +(declare continue-walking) -(defn parse-u - "Parse u-* classes within HTML element" - [loc] - (->> loc - z/node - element-to-classes - ((classes-to-props "u-")) - (r/map #(hash-map % (list (get-u-value loc)))) - (into {}))) +(defn gen-property-parser + "Create a property parser" + [f] + (fn [loc] + (apply (partial merge-with concat) + (f loc) + (continue-walking loc)))) -(defn parse-dt +(def parse-p + "Parse p-* classes within HTML element." + (gen-property-parser + (fn [loc] + (->> loc + z/node + element-to-classes + ((classes-to-props "p-")) + (r/map #(hash-map % (list (get-p-value loc)))) + (into {}))))) + +(def parse-u + "Parse u-* classes within HTML element" + (gen-property-parser + (fn [loc] + (->> loc + z/node + element-to-classes + ((classes-to-props "u-")) + (r/map #(hash-map % (list (get-u-value loc)))) + (into {}))))) + +(def parse-dt "Parse dt-* classes within HTML element" - [loc] - (->> loc - z/node - element-to-classes - ((classes-to-props "dt-")) - (r/map #(hash-map % (list (get-dt-value loc)))) - (into {}))) - -(defn parse-e + (gen-property-parser + (fn [loc] + (->> loc + z/node + element-to-classes + ((classes-to-props "dt-")) + (r/map #(hash-map % (list (get-dt-value loc)))) + (into {}))))) + +(def parse-e "Parse e-* classes within HTML element" - [loc] - (->> loc - z/node - element-to-classes - ((classes-to-props "e-")) - (r/map #(hash-map % (get-e-value loc))) - (into {}))) + (gen-property-parser + (fn [loc] + (->> loc + z/node + element-to-classes + ((classes-to-props "e-")) + (r/map #(hash-map % (get-e-value loc))) + (into {}))))) (defn- get-mf-names "Get the microformat names from an element" @@ -292,41 +306,37 @@ :url (list (parse-implied-url loc)) :photo (list (parse-implied-photo loc))}))) -(defn- select-p - [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "p-")]])) - -(defn- select-u - [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "u-")]])) - -(defn- select-dt - [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "dt-")]])) - -(defn- select-e - [element] (zip-select element [[(html/but-node #{:br :hr}) (html/attr-contains :class "e-")]])) - -(defn get-mf-properties - "Parse children of a microformat, returning a map of properties" - [loc] - (let [cappend (partial merge-with concat)] - (merge (parse-implied loc) - (apply cappend (map parse-p (select-p loc))) - (apply cappend (map parse-u (select-u loc))) - (apply cappend (map parse-dt (select-dt loc))) - (apply cappend (map parse-e (select-e loc)))))) - (defn parse-h "Parse h-* classes within a HTML element." [loc] (hash-map :type (get-mf-names (z/node loc)) - :properties (get-mf-properties loc))) - -(defn map-h - "Map fn to top-level h-* elements within a HTML element." - [fn loc] - (if (some->> loc z/node :attrs :class (re-matcher #"(?:^|\s) h-\w")) - (list (fn loc)) - (when (not (z/end? loc)) - (recur fn (z/next loc))))) + :properties (apply merge (parse-implied loc) + (continue-walking loc)))) + +(defn parse-mf + "Parse microformats within a HTML element." + [loc mf-type] + (case mf-type + "h" (parse-h loc) + "p" (parse-p loc) + "u" (parse-u loc) + "dt" (parse-dt loc) + "e" (parse-e loc))) + +(defn walk + "Walk HTML element tree for microformat properties." + [loc] + (when (and (not (z/end? loc)) + (not (contains? #{:br :hr} (-> loc z/node :tag)))) + (if-let [class-groups (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) distinct first list)] + (map (partial parse-mf loc) class-groups) + (recur (z/next loc))))) + +(defn continue-walking + "Keep walking that tree" + [loc] + (when (not (z/end? loc)) + (walk (z/next loc)))) (defn parse-rel "Parse rel attributes of an HTML link element" @@ -351,4 +361,4 @@ "Parse a HTML string with microformats" [html] (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] - {:items (some->> document (map-h parse-h)) :rels (parse-rels document)})) + {:items (some-> document walk) :rels (parse-rels document)})) -- cgit 1.4.1