(ns microformats.parser
  (:require [net.cgrand.enlive-html :as html]
            [clojure.core.reducers :as r]
            [clojure.string :as str]))

(defn mf-names-from-class
  "Return a predicate that is true for class-name strings starting with
  `prefix` (e.g. \"h-\" or \"p-\")."
  [prefix]
  (fn [^String class-name]
    (.startsWith class-name prefix)))

(defn remove-mf-prefix
  "Return a function that strips the first `(count prefix)` characters from a
  class-name string. A string shorter than the prefix becomes \"\"."
  [prefix]
  (fn [class-name]
    (subs class-name (min (count class-name) (count prefix)))))

(defn- split-ws-attribute
  "Split a whitespace-separated attribute value into a vector of its
  non-empty tokens."
  [class]
  ;; re-seq on \S+ never yields empty tokens, unlike str/split on \s+, which
  ;; returns a leading \"\" for values such as \" h-card\" and [\"\"] for \"\".
  (vec (re-seq #"\S+" class)))

(defn classes-to-props
  "Return a function that turns a collection of class names into a reducible
  of property keywords: keeps the names starting with `prefix`, strips the
  prefix, and converts the remainder to a keyword."
  [prefix]
  ;; comp applies right-to-left: filter first, then strip, then keywordize.
  (comp (r/map keyword)
        (r/map (remove-mf-prefix prefix))
        (r/filter (mf-names-from-class prefix))))

(defn element-to-classes
  "Get the vector of class names of an element, or nil when the element (or
  its class attribute) is missing."
  [el]
  (some-> el :attrs :class split-ws-attribute))

(defn element-to-rels
  "Get the vector of rel values of an element, or nil when the element (or
  its rel attribute) is missing."
  [el]
  ;; some-> (rather than ->) so an element without a rel attribute yields nil
  ;; instead of throwing inside split-ws-attribute.
  (some-> el :attrs :rel split-ws-attribute))

(defn- node-to-html
  "Turn a sequence of nodes into a sequence of HTML strings. String nodes are
  passed through unchanged; every other node is serialized with enlive's
  emit-tag."
  [el]
  ;; NOTE(review): relies on the two-argument, transient-accumulating
  ;; `html/emit-tag` of the enlive version in use -- confirm when upgrading.
  (map #(if (string? %)
          %
          (apply str (persistent! (html/emit-tag % (transient [])))))
       el))

(defn- node-to-text
  "Turn node content into a single text string: every text fragment has its
  whitespace runs collapsed to one space, the fragments are concatenated and
  the result is trimmed."
  [content]
  (->> content
       html/texts
       (map #(str/replace % #"\s+" " "))
       (apply str)
       str/trim))

(defn get-value-class
  "Join the text content of each of `elements` with single spaces."
  [elements]
  (str/join " " (mapv (comp node-to-text :content) elements)))

(defn find-value-class
  "Select the elements matched by [root :> .value] under `el` and return
  their joined text, or nil when nothing matches."
  [el]
  (when-let [values (seq (html/select el [html/root :> :.value]))]
    (get-value-class values)))

(defn get-p-value
  "Get the p-x property value of an element. Tried in order: value-class
  content, a tag-specific attribute (alt for img/area, title for abbr, value
  for data/input), the element's text content, and finally \"\"."
  [el]
  (str/trim
   (or (find-value-class el)
       (case (:tag el)
         (:img :area) (-> el :attrs :alt)
         :abbr (-> el :attrs :title)
         (:data :input) (-> el :attrs :value)
         nil)
       (node-to-text (:content el))
       "")))

(defn get-u-value
  "Get the u-x property value of an element. Tried in order: value-class
  content, a tag-specific attribute (href for a/area, src for img, data for
  object), and for any other tag the p-x value rules; falls back to the text
  content when the attribute is absent."
  [el]
  (str/trim
   (or (find-value-class el)
       (case (:tag el)
         (:a :area) (-> el :attrs :href)
         :img (-> el :attrs :src)
         :object (-> el :attrs :data)
         (get-p-value el))
       (node-to-text (:content el))
       "")))

(defn get-dt-value
  "Get the dt-x property value of an element. Tried in order: value-class
  content, a tag-specific attribute (datetime for time/ins/del, title for
  abbr, value for data/input), the element's text content, and finally \"\"."
  [el]
  (str/trim
   (or (find-value-class el)
       (case (:tag el)
         (:time :ins :del) (-> el :attrs :datetime)
         :abbr (-> el :attrs :title)
         (:data :input) (-> el :attrs :value)
         ;; Explicit default: without it `case` throws
         ;; IllegalArgumentException for any other tag (e.g. span).
         nil)
       (node-to-text (:content el))
       "")))

(defn get-e-value
  "Get the e-x property value of an element: a one-element list holding a map
  with the serialized :html of the element's content and its text :value."
  [el]
  (let [content (:content el)]
    (list {:html (apply str (node-to-html content))
           :value (node-to-text content)})))

(defn- parse-prefixed
  "Build a map from each `prefix`-ed class of `element` (as a keyword without
  the prefix) to `(value-fn element)`. Returns {} when no class matches."
  [prefix value-fn element]
  (->> element
       element-to-classes
       ((classes-to-props prefix))
       (r/map #(hash-map % (value-fn element)))
       (into {})))

(defn parse-p
  "Parse p-* classes within HTML element."
  [element]
  (parse-prefixed "p-" (comp list get-p-value) element))

(defn parse-u
  "Parse u-* classes within HTML element."
  [element]
  (parse-prefixed "u-" (comp list get-u-value) element))

(defn parse-dt
  "Parse dt-* classes within HTML element."
  [element]
  (parse-prefixed "dt-" (comp list get-dt-value) element))

(defn parse-e
  "Parse e-* classes within HTML element."
  [element]
  ;; get-e-value already returns a list, so it is not wrapped again.
  (parse-prefixed "e-" get-e-value element))

(defn- get-mf-names
  "Get the microformat names (the classes starting with \"h-\") of an element
  as a vector of strings."
  [element]
  (->> element
       element-to-classes
       (r/filter (mf-names-from-class "h-"))
       (into [])))

(defn- imply-name
  "Imply the name of an entity from the element's text content, as a
  one-element list."
  [element]
  (list (node-to-text (:content element))))

(defn parse-implied
  "Parse implied properties of a HTML element. Currently only :name."
  [element]
  {:name (imply-name element)})

(defn parse-h
  "Parse h-* classes within a HTML element.

  Returns a map with :type (vector of the element's h-* class names) and
  :properties. Properties start from the implied ones and are overridden by
  the explicit p-/u-/dt-/e- properties found on every selected property
  element; values of a property that occurs more than once are concatenated
  in selection order."
  [element]
  ;; NOTE(review): attr-starts only matches when the class attribute itself
  ;; begins with the prefix, so class=\"foo p-name\" is not selected; the
  ;; selection also does not stop at nested h-* items.
  (let [prop-selector [(html/union [(html/attr-starts :class "p-")
                                    (html/attr-starts :class "u-")
                                    (html/attr-starts :class "dt-")
                                    (html/attr-starts :class "e-")])]
        parse-all (juxt parse-p parse-u parse-dt parse-e)
        ;; Parse every property element, not just the first one selected.
        ;; With no matches, merge-with returns nil and merge ignores it.
        explicit (->> (html/select element prop-selector)
                      (mapcat parse-all)
                      (apply merge-with concat))]
    (hash-map :type (get-mf-names element)
              :properties (merge (parse-implied element) explicit))))

(defn select-h
  "Select h-* elements within a HTML document."
  [element]
  (html/select element [(html/attr-starts :class "h-")]))

(defn parse-rel
  "Parse the rel attribute of an HTML link element into a map from each rel
  value (as a keyword) to a one-element vector holding the element's href."
  [element]
  (->> element
       element-to-rels
       (map keyword)
       (map #(hash-map % [(-> element :attrs :href)]))
       (into {})))

(defn select-rels
  "Select a and link elements that carry a rel attribute."
  [html]
  (html/select html [[#{:a :link} (html/attr? :rel)]]))

(defn parse-rels
  "Parse rel attributes of a set of HTML link elements into one map from rel
  keyword to a vector of hrefs. Returns {} when there are no such elements."
  [elements]
  (or (apply merge-with into (map parse-rel (select-rels elements)))
      {}))

(defn parse
  "Parse a HTML string with microformats. Returns a map with :items (vector
  of parsed h-* elements) and :rels (map of rel keyword to hrefs)."
  [html]
  (let [document (html/html-snippet html)]
    {:items (mapv parse-h (select-h document))
     :rels (parse-rels document)}))