about summary refs log tree commit diff stats
path: root/src/microformats/parser.clj
blob: 836fdcbc39ae70cac66fefee603fe8ac3a849dc3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
(ns microformats.parser
  (:require [net.cgrand.enlive-html :as html]
            [clojure.core.reducers :as r]
            [clojure.string :as str]))

(defn mf-names-from-class
  "Build a filter for microformat class names.

  Returns a one-argument function. Given a reducible collection of
  class-name strings, that function yields a reducer containing only
  the names that start with `prefix`."
  [prefix]
  (fn [class-names]
    (r/filter (fn [class-name] (.startsWith class-name prefix))
              class-names)))

(defn remove-mf-prefix
  "Build a mapper that strips a microformat prefix from class names.

  Returns a one-argument function. Given a reducible collection of
  class-name strings, that function yields a reducer in which the first
  `(count prefix)` characters of every name have been dropped. A name
  shorter than the prefix becomes the empty string."
  [prefix]
  (let [prefix-length (count prefix)]
    (fn [class-names]
      (r/map (fn [class-name]
               (apply str (drop prefix-length class-name)))
             class-names))))

(defn- split-classes
  "Split a whitespace-separated class attribute into a vector of names.

  `class` is the raw attribute value and may be nil (element without a
  class attribute). A nil or blank value yields an empty vector instead
  of throwing. Surrounding whitespace is trimmed first so that it does
  not produce an empty-string token."
  [class]
  (if (str/blank? class)
    []
    (str/split (str/trim class) #"\s+")))

(defn classes-to-props
  "Build a converter from class names to microformat property keywords.

  Returns a one-argument function. Given a reducible collection of
  class-name strings, it keeps the names starting with `prefix`, strips
  that prefix, and turns each remainder into a keyword. The result is a
  reducer, not a realised collection."
  [prefix]
  (let [keep-prefixed (mf-names-from-class prefix)
        strip-prefix  (remove-mf-prefix prefix)]
    (fn [class-names]
      (r/map keyword (strip-prefix (keep-prefixed class-names))))))

(defn element-to-classes
  "Return the class names of `el`, read from its `:class` attribute
  (under `:attrs`) and split on whitespace."
  [el]
  (split-classes (get-in el [:attrs :class])))

(defn get-p-value
  "Return the p-* property value of element `el`.

  Lookup order:
    1. a tag-specific attribute (`alt` for img/area, `title` for abbr,
       `value` for data/input);
    2. the first item of the element's `:content`;
    3. the empty string."
  [el]
  (let [attr-for-tag {:img   :alt
                      :area  :alt
                      :abbr  :title
                      :data  :value
                      :input :value}]
    (or (when-let [attr (attr-for-tag (:tag el))]
          (get (:attrs el) attr))
        (first (:content el))
        "")))

(defn get-u-value
  "Return the u-* property value of element `el`.

  Lookup order:
    1. a tag-specific URL attribute (`href` for a/area, `src` for img,
       `data` for object); for every other tag, the element's p-value
       as computed by `get-p-value`;
    2. the first item of the element's `:content`;
    3. the empty string.

  Steps 2 and 3 are reached when one of the four URL-bearing tags lacks
  its attribute."
  [el]
  (or (case (:tag el)
        :a (-> el :attrs :href)
        :area (-> el :attrs :href)
        :img (-> el :attrs :src)
        :object (-> el :attrs :data)
        (get-p-value el))
      ;; Was `(first :content el)`, which calls `first` with two
      ;; arguments and throws an ArityException.
      (first (:content el))
      ""))

(defn get-dt-value
  "Return the dt-* property value of element `el`.

  Lookup order:
    1. a tag-specific attribute (`datetime` for time/ins/del, `title`
       for abbr, `value` for data/input);
    2. the first item of the element's `:content`;
    3. the empty string."
  [el]
  (or (case (:tag el)
        :time (-> el :attrs :datetime)
        :ins  (-> el :attrs :datetime)
        :del  (-> el :attrs :datetime)
        :abbr (-> el :attrs :title)
        :data (-> el :attrs :value)
        :input (-> el :attrs :value)
        ;; Explicit default: without it `case` throws
        ;; IllegalArgumentException for any other tag instead of
        ;; falling through to the content fallback below.
        nil)
      (first (:content el))
      ""))

(defn parse-p
  "Parse the p-* classes of an HTML element.

  Returns a map from each p-* property keyword found in the element's
  class list (prefix stripped) to the element's p-value. Every property
  on the element gets the same value."
  [element]
  (let [value    (get-p-value element)
        to-props (classes-to-props "p-")
        props    (to-props (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value)) {} props)))

(defn parse-u
  "Parse the u-* classes of an HTML element.

  Returns a map from each u-* property keyword found in the element's
  class list (prefix stripped) to the element's u-value. Every property
  on the element gets the same value."
  [element]
  (let [value    (get-u-value element)
        to-props (classes-to-props "u-")
        props    (to-props (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value)) {} props)))

(defn parse-dt
  "Parse the dt-* classes of an HTML element.

  Returns a map from each dt-* property keyword found in the element's
  class list (prefix stripped) to the element's dt-value. Every property
  on the element gets the same value."
  [element]
  (let [value    (get-dt-value element)
        to-props (classes-to-props "dt-")
        props    (to-props (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value)) {} props)))

(defn parse-children
  "Parse the microformat properties found inside `element`.

  Selects every node under `element` whose class attribute starts with
  `p-`, `u-` or `dt-`, parses the p-* and u-* classes of each one, and
  merges the results. Returns `{:properties {...}}`; the properties map
  is empty when nothing matches.

  When the same property appears on several elements, the first one in
  selection order wins, so the result for the first matching element is
  unchanged from parsing that element alone.

  NOTE(review): dt-* elements are selected but `parse-dt` is not applied
  to them here, so a class attribute consisting only of dt-* names
  contributes no properties. Confirm whether that is intended."
  [element]
  (let [prop-els (html/select element [(html/union [(html/attr-starts :class "p-")
                                                    (html/attr-starts :class "u-")
                                                    (html/attr-starts :class "dt-")])])
        props-of (fn [el] (merge (parse-p el) (parse-u el)))]
    ;; Previously only `(first prop-els)` was parsed: every later property
    ;; was dropped, and an element without any property children passed
    ;; nil down the chain and blew up while splitting the class string.
    {:properties (reduce (fn [acc el]
                           ;; `acc` goes last so already-seen properties win.
                           (merge (props-of el) acc))
                         {}
                         prop-els)}))

(defn parse-h
  "Parse the h-* microformat roots of an HTML document.

  Selects every node in `html` whose class attribute starts with `h-`
  and returns a vector with the `parse-children` result for each one."
  [html]
  (->> (html/select html [(html/attr-starts :class "h-")])
       (map parse-children)
       vec))

(defn parse
  "Parse an HTML string containing microformats.

  Returns a map with `:items`, the vector produced by `parse-h` for the
  parsed snippet, and `:rels`, which is always an empty map here."
  [html]
  (let [items (-> html html/html-snippet parse-h)]
    {:items items
     :rels  {}}))