diff options
author | Alan Pearce | 2014-10-12 00:52:36 +0100 |
---|---|---|
committer | Alan Pearce | 2014-10-12 00:52:36 +0100 |
commit | 590b586073457434ade3d6ebbfc6f2ca72cc8806 (patch) | |
tree | 2ff215902ed05bb5507dd5884ef0c1cee9b515ab /src | |
parent | 8d3b9dc5f28587d0290ebf6f7c0bedf0866f5b62 (diff) | |
download | microformats-590b586073457434ade3d6ebbfc6f2ca72cc8806.tar.lz microformats-590b586073457434ade3d6ebbfc6f2ca72cc8806.tar.zst microformats-590b586073457434ade3d6ebbfc6f2ca72cc8806.zip |
Ensure child microformats are only parsed once
Diffstat (limited to 'src')
-rw-r--r-- | src/microformats/parser.clj | 13 |
1 files changed, 12 insertions, 1 deletions
diff --git a/src/microformats/parser.clj b/src/microformats/parser.clj
index 92104dc..759f84c 100644
--- a/src/microformats/parser.clj
+++ b/src/microformats/parser.clj
@@ -2,6 +2,7 @@
   (:require [net.cgrand.enlive-html :as html]
             [clojure.zip :as z]
             [clojure.core.reducers :as r]
+            [clojure.set :as set]
             [clojure.string :as str]
             [clojurewerkz.urly.core :as url]))
 
@@ -323,13 +324,23 @@
     "dt" (parse-dt loc)
     "e" (parse-e loc)))
 
+(defn has-child?
+  [types] (set/subset? #{"p" "h"} types))
+
+(defn single-pass-child
+  "Ensure a child microformat of a property is only parsed as a child"
+  [types]
+  (if (has-child? types)
+    (remove #(= "h" %) types)
+    types))
+
 (defn walk
   "Walk HTML element tree for microformat properties."
   [loc]
   (when (and (not (z/end? loc))
              (not (contains? #{:br :hr} (-> loc z/node :tag))))
-    (map (partial parse-mf loc) class-groups)
     (if-let [types (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) set)]
+      (map (partial parse-mf loc) (single-pass-child types))
       (recur (z/next loc)))))
 
 (defn continue-walking