(ns microformats.parser (:require [net.cgrand.enlive-html :as html] [clojure.zip :as z] [clojure.core.reducers :as r] [clojure.string :as str] [clojurewerkz.urly.core :as url])) (defn- as-locs [loc] (if (and (vector? loc) (= (count loc) 2) (not (vector? (first loc)))) [loc] loc)) (defn- zip-select "Workaround https://github.com/cgrand/enlive/issues/117" [loc-or-locs selector] (html/zip-select (as-locs loc-or-locs) (list (@#'html/automaton selector)))) (defmacro anacond [& clauses] (when clauses (list 'if-let ['% (first clauses)] (if (next clauses) (second clauses) (throw (IllegalArgumentException. "anacond requires an even number of forms"))) (cons 'anacond (next (next clauses)))))) (defn prefixed-by? [prefix] #(.startsWith % prefix)) (defn remove-mf-prefix "Remove microformats prefixes from a class attribute" [prefix] #(apply str (drop (count prefix) %))) (defn- split-ws-attribute "Split a whitespace-separated attribute." [class] (str/split class #"\s+")) (defn classes-to-props "Convert class list to list of microformat property keywords" [prefix] (comp (r/map keyword) (r/map (remove-mf-prefix prefix)) (r/filter (prefixed-by? prefix)))) (defn element-to-classes "Get list of classes from an element" [el] (some-> el :attrs :class split-ws-attribute)) (defn element-to-rels "Get list of rels from an element" [el] (-> el :attrs :rel split-ws-attribute)) (defn- node-to-html "Turn a node into a list of HTML strings" [el] (map #(if (string? %) % (apply str (persistent! (html/emit-tag % (transient []))))) el)) (defn- node-to-text "Turn a node into a text string" [content] (->> content html/texts (apply str) (#(str/replace % #"\s+" " ")) str/trim)) (defn get-base-url "Find the base-url of a document." [document] (or (-> document (html/select [:head :> [:base (html/attr? :href)]]) first :attrs :href) "")) (defn normalise-url "Normalise a URL" [root url] (url/resolve (get-base-url root) url)) (defn get-value-title-class "Get the value-title class of elements" [elements] (str/join " " (into [] ((comp (r/map :title) (r/map :attrs)) elements)))) (defn get-value-class "Get the value class of elements" [elements] (str/join " " (into [] ((comp (r/map (partial apply str)) (r/map node-to-text) (r/map :content)) elements)))) (defn find-value-class "Find and get the value class of elements" [el] (anacond (not-empty (html/select el [html/root :> :.value-title])) (get-value-title-class %) (not-empty (html/select el [html/root :> :.value ])) (get-value-class %))) (declare parse-h) (defn remove-property-classes [element] (into {} (html/transform (list element) [html/root] (apply html/remove-class (filter (prefixed-by? "p-") (element-to-classes element)))))) (defn get-child-mf-properties [loc] (assoc (parse-h (z/edit loc remove-property-classes)) :value (-> loc z/node :content node-to-text))) (defn- find-child-mf "Find child property microformats of an element." [loc] (let [element (z/node loc)] (when (-> element :attrs :class (.indexOf "h-") (>= 0)) (get-child-mf-properties loc)))) (defn get-p-value "Get the p-x property value of an element" [loc] (let [el (z/node loc)] (or (find-child-mf loc) (str/trim (or (find-value-class el) (case (:tag el) :img (-> el :attrs :alt) :area (-> el :attrs :alt) :abbr (-> el :attrs :title) :data (-> el :attrs :value) :input (-> el :attrs :value) nil) (node-to-text (:content el)) ""))))) (defn get-u-value "Get the u-x property value of an element" [loc] (let [el (z/node loc)] (str/trim (or (find-value-class el) (case (:tag el) :a (normalise-url (z/root loc) (-> el :attrs :href)) :area (normalise-url (z/root loc) (-> el :attrs :href)) :img (normalise-url (z/root loc) (-> el :attrs :src)) :object (normalise-url (z/root loc) (-> el :attrs :data)) (get-p-value loc)) (node-to-text (:content el)) "")))) (defn get-dt-value "Get the dt-x property value of an element" [loc] (let [el (z/node loc)] (str/trim (or (find-value-class el) (case (:tag el) :time (-> el :attrs :datetime) :ins (-> el :attrs :datetime) :del (-> el :attrs :datetime) :abbr (-> el :attrs :title) :data (-> el :attrs :value) :input (-> el :attrs :value) nil) (node-to-text (:content el)) "")))) (defn get-e-value "Get the e-x propery value of an element" [loc] (let [el (z/node loc) content (:content el)] (list {:html (apply str (node-to-html content)) :value (apply str (node-to-text content))}))) (declare continue-walking) (defn gen-property-parser "Create a property parser" [f] (fn [loc] (apply (partial merge-with concat) (f loc) (continue-walking loc)))) (def parse-p "Parse p-* classes within HTML element." (gen-property-parser (fn [loc] (->> loc z/node element-to-classes ((classes-to-props "p-")) (r/map #(hash-map % (list (get-p-value loc)))) (into {}))))) (def parse-u "Parse u-* classes within HTML element" (gen-property-parser (fn [loc] (->> loc z/node element-to-classes ((classes-to-props "u-")) (r/map #(hash-map % (list (get-u-value loc)))) (into {}))))) (def parse-dt "Parse dt-* classes within HTML element" (gen-property-parser (fn [loc] (->> loc z/node element-to-classes ((classes-to-props "dt-")) (r/map #(hash-map % (list (get-dt-value loc)))) (into {}))))) (def parse-e "Parse e-* classes within HTML element" (gen-property-parser (fn [loc] (->> loc z/node element-to-classes ((classes-to-props "e-")) (r/map #(hash-map % (get-e-value loc))) (into {}))))) (defn- get-mf-names "Get the microformat names from an element" [element] (->> element element-to-classes (filter (prefixed-by? "h-")))) (defn- parse-implied-name "Get the implied name of an entity" [loc] (let [element (z/node loc)] (case (:tag element) :abbr (-> element :attrs :title) :img (-> element :attrs :alt) (anacond (first (html/select element [html/root :> [:img html/only-child]])) (-> % :attrs :alt) (first (html/select element [html/root :> [:abbr html/only-child (html/attr? :title)]])) (-> % :attrs :title) (first (html/select element [html/root :> html/only-child :> [:img html/only-child]])) (-> % :attrs :alt) (first (html/select element [html/root :> html/only-child :> [:abbr html/only-child (html/attr? :title)]])) (-> % :attrs :title) true (node-to-text (:content element)))))) (defn- parse-implied-url [loc] (let [element (z/node loc)] (some->> (case (:tag element) :a (-> element :attrs :href) (if-let [% (first (html/select element [html/root :> [:a (html/attr? :href) html/only-of-type (html/but-node (html/attr-contains :class "h-"))]]))] (-> % :attrs :href))) (normalise-url (z/root loc))))) (defn- parse-implied-photo [loc] (let [element (z/node loc)] (some->> (case (:tag element) :img (-> element :attrs :src) :object (-> element :attrs :data) (anacond (first (html/select element [html/root :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) (-> % :attrs :src) (first (html/select element [html/root :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) (-> % :attrs :data) (first (html/select element [html/root :> html/only-child :> [:img (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) (-> % :attrs :src) (first (html/select element [html/root :> html/only-child :> [:object (html/but-node (html/attr-contains :class "h-")) html/only-of-type]])) (-> % :attrs :data))) (normalise-url (z/root loc))))) (def empty-ish #(not (str/blank? (first (second %))))) (defn parse-implied "Parse implied properties of a HTML element" [loc] (into {} (filter empty-ish {:name (list (parse-implied-name loc)) :url (list (parse-implied-url loc)) :photo (list (parse-implied-photo loc))}))) (defn parse-h "Parse h-* classes within a HTML element." [loc] (hash-map :type (get-mf-names (z/node loc)) :properties (apply merge (parse-implied loc) (continue-walking loc)))) (defn parse-mf "Parse microformats within a HTML element." [loc mf-type] (case mf-type "h" (parse-h loc) "p" (parse-p loc) "u" (parse-u loc) "dt" (parse-dt loc) "e" (parse-e loc))) (defn walk "Walk HTML element tree for microformat properties." [loc] (when (and (not (z/end? loc)) (not (contains? #{:br :hr} (-> loc z/node :tag)))) (map (partial parse-mf loc) class-groups) (if-let [types (some->> loc z/node :attrs :class (re-seq #"(?:^|\s)(h|p|u|dt|e)-\w+") (map second) set)] (recur (z/next loc))))) (defn continue-walking "Keep walking that tree" [loc] (when (not (z/end? loc)) (walk (z/next loc)))) (defn parse-rel "Parse rel attributes of an HTML link element" [loc] (->> loc z/node element-to-rels (map keyword) (map #(hash-map % [(normalise-url (z/root loc) (-> loc z/node :attrs :href))])) (into {}))) (defn select-rels "Select linking HTML elements with rel attributes" [html] (zip-select html [[#{:a :link} (html/attr? :rel)]])) (defn parse-rels "Parse rel attibutes of a set of HTML link elements" [locs] (or (apply merge-with into (map parse-rel (select-rels locs))) {})) (defn parse "Parse a HTML string with microformats" [html] (let [document (first (map z/xml-zip (html/html-snippet (str/trim html))))] {:items (some-> document walk) :rels (parse-rels document)}))