src/microformats/parser.clj (view raw)
(ns microformats.parser
  (:require [net.cgrand.enlive-html :as html]
            [clojure.core.reducers :as r]
            [clojure.string :as str]))

(defn mf-names-from-class
  "Returns a function that, applied to a collection of class names,
  yields a reducible keeping only the names that start with `prefix`."
  [prefix]
  ;; Type hints avoid a reflective call to String.startsWith.
  (r/filter (fn [^String class-name]
              (.startsWith class-name ^String prefix))))

(defn remove-mf-prefix
  "Returns a function that, applied to a collection of class names,
  yields a reducible with the first `(count prefix)` characters dropped
  from every name."
  [prefix]
  (r/map #(apply str (drop (count prefix) %))))

(defn- split-classes
  "Splits a whitespace-separated class string into a vector of class names.
  Returns an empty vector for nil or blank input so that elements without
  a class attribute do not raise a NullPointerException."
  [class]
  (if (str/blank? class)
    []
    (str/split (str/trim class) #"\s+")))

(defn classes-to-props
  "Returns a function turning a collection of class names into a reducible
  of property keywords: names starting with `prefix` are kept, the prefix
  is stripped, and the remainder is converted to a keyword."
  [prefix]
  ;; comp applies right-to-left: filter, then strip the prefix, then keyword.
  (comp (r/map keyword)
        (remove-mf-prefix prefix)
        (mf-names-from-class prefix)))

(defn element-to-classes
  "Returns the vector of class names found in the :class attribute of `el`.
  Returns an empty vector when `el` is nil or has no :class attribute."
  [el]
  (-> el :attrs :class split-classes))

(defn get-p-property
  "Returns the p-* property value of `el`: a tag-specific attribute
  (alt for img/area, title for abbr, value for data/input) when present,
  otherwise the first item of the element's :content, otherwise \"\"."
  [el]
  (or (case (:tag el)
        :img   (-> el :attrs :alt)
        :area  (-> el :attrs :alt)
        :abbr  (-> el :attrs :title)
        :data  (-> el :attrs :value)
        :input (-> el :attrs :value)
        nil)
      (first (:content el))
      ""))

(defn get-u-property
  "Returns the u-* property value of `el`: href for a/area, src for img,
  data for object; any other tag is resolved with `get-p-property`.
  When the tag-specific attribute is missing, falls back to the first item
  of the element's :content, then to \"\"."
  [el]
  (or (case (:tag el)
        :a      (-> el :attrs :href)
        :area   (-> el :attrs :href)
        :img    (-> el :attrs :src)
        :object (-> el :attrs :data)
        (get-p-property el))
      ;; Was `(first :content el)`, a two-argument call to `first` that
      ;; threw an ArityException whenever this fallback was reached.
      (first (:content el))
      ""))

(defn get-dt-property
  "Returns the dt-* property value of `el`: datetime for time/ins/del,
  title for abbr, value for data/input. For any other tag, or when the
  attribute is missing, returns the first item of the element's :content,
  then \"\"."
  [el]
  (or (case (:tag el)
        :time  (-> el :attrs :datetime)
        :ins   (-> el :attrs :datetime)
        :del   (-> el :attrs :datetime)
        :abbr  (-> el :attrs :title)
        :data  (-> el :attrs :value)
        :input (-> el :attrs :value)
        ;; Default clause: without it `case` throws IllegalArgumentException
        ;; for every tag not listed above.
        nil)
      (first (:content el))
      ""))

(defn- props-for
  "Builds a map from every property keyword derived from the `prefix`
  classes of `element` to `value`."
  [prefix value element]
  (into {}
        (r/map #(hash-map % value)
               ((classes-to-props prefix) (element-to-classes element)))))

(defn parse-p
  "Parses the p-* classes of `element` into a map of property keyword to
  the element's p-property value."
  [element]
  (props-for "p-" (get-p-property element) element))

(defn parse-u
  "Parses the u-* classes of `element` into a map of property keyword to
  the element's u-property value."
  [element]
  (props-for "u-" (get-u-property element) element))

(defn parse-dt
  "Parses the dt-* classes of `element` into a map of property keyword to
  the element's dt-property value."
  [element]
  (props-for "dt-" (get-dt-property element) element))

(defn parse-children
  "Parses the microformat properties found beneath `element`.
  Returns a map {:properties {...}}; the properties map is empty when no
  element is selected."
  [element]
  ;; NOTE(review): only the FIRST selected element is parsed, so at most one
  ;; element's properties are returned -- confirm whether every match
  ;; should be merged instead.
  (let [el (first (html/select element
                               [(html/union [(html/attr-starts :class "p-")
                                             (html/attr-starts :class "u-")
                                             (html/attr-starts :class "dt-")])]))]
    ;; dt-* properties were selected but never parsed. They are merged first
    ;; so that p-*/u-* values keep the precedence they had before.
    (hash-map :properties (merge (parse-dt el) (parse-p el) (parse-u el)))))

(defn parse-h
  "Selects the elements of `html` whose class attribute starts with \"h-\"
  and returns a vector with the result of `parse-children` for each."
  [html]
  (mapv parse-children (html/select html [(html/attr-starts :class "h-")])))

(defn parse
  "Parses an HTML string containing microformats.
  Returns a map with :items (the result of `parse-h` on the parsed snippet)
  and :rels (currently always an empty map)."
  [html]
  (let [document (html/html-snippet html)]
    {:items (parse-h document)
     :rels  {}}))