blob: 836fdcbc39ae70cac66fefee603fe8ac3a849dc3 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
(ns microformats.parser
(:require [net.cgrand.enlive-html :as html]
[clojure.core.reducers :as r]
[clojure.string :as str]))
(defn mf-names-from-class
  "Build a one-argument function that narrows a collection of class
  names down to the ones beginning with `prefix`. The narrowing is done
  with a clojure.core.reducers filter, so the result is a reducible
  collection rather than a seq."
  [prefix]
  (letfn [(prefixed? [class-name]
            (.startsWith class-name prefix))]
    (fn [class-names]
      (r/filter prefixed? class-names))))
(defn remove-mf-prefix
  "Build a one-argument function that, for every class name in a
  collection, drops as many leading characters as `prefix` is long and
  rebuilds the remainder as a string. The mapping is done with a
  clojure.core.reducers map, so the result is a reducible collection."
  [prefix]
  (letfn [(strip [class-name]
            (->> class-name
                 (drop (count prefix))
                 (apply str)))]
    (fn [class-names]
      (r/map strip class-names))))
(defn- split-classes
  "Break `class` apart at every run of whitespace and return the pieces
  as a vector of strings."
  [class]
  (let [whitespace-run #"\s+"]
    (-> class
        (str/split whitespace-run))))
(defn classes-to-props
  "Build a one-argument function that turns a collection of class names
  into microformat property keywords: keep the names that start with
  `prefix`, strip that prefix, then convert what is left to keywords."
  [prefix]
  (let [select-prefixed (mf-names-from-class prefix)
        strip-prefix    (remove-mf-prefix prefix)
        keywordize      (r/map keyword)]
    (fn [class-names]
      (-> class-names
          select-prefixed
          strip-prefix
          keywordize))))
(defn element-to-classes
  "Look up the :class attribute under the element's :attrs and split it
  into individual class names."
  [el]
  (let [class-attr (get-in el [:attrs :class])]
    (split-classes class-attr)))
(defn get-p-value
  "Get the p-x property value of an element.

  Certain tags carry their value in an attribute: :alt for :img and
  :area, :title for :abbr, :value for :data and :input. When the tag is
  not one of those, or the attribute is absent, the first item of the
  element's :content is used; if that is missing too, the result is the
  empty string."
  [el]
  (let [attr-for-tag {:img   :alt
                      :area  :alt
                      :abbr  :title
                      :data  :value
                      :input :value}
        attr-name    (get attr-for-tag (:tag el))
        from-attr    (when attr-name
                       (get-in el [:attrs attr-name]))]
    (or from-attr
        (first (:content el))
        "")))
(defn get-u-value
  "Get the u-x property value of an element.

  The value is read from :href for :a and :area, from :src for :img and
  from :data for :object; every other tag is delegated to `get-p-value`.
  When the chosen attribute is absent, the first item of the element's
  :content is returned, and failing that the empty string."
  [el]
  (or (case (:tag el)
        :a (-> el :attrs :href)
        :area (-> el :attrs :href)
        :img (-> el :attrs :src)
        :object (-> el :attrs :data)
        (get-p-value el))
      ;; Previously `(first :content el)`, which calls `first` with two
      ;; arguments and throws an ArityException as soon as the attribute
      ;; lookup above yields nil (e.g. an :a element without :href).
      (first (:content el))
      ""))
(defn get-dt-value
  "Get the dt-x property value of an element.

  The value is read from :datetime for :time, :ins and :del, from :title
  for :abbr, and from :value for :data and :input. For any other tag, or
  when the attribute is absent, the first item of the element's :content
  is returned, and failing that the empty string."
  [el]
  (or (case (:tag el)
        :time (-> el :attrs :datetime)
        :ins (-> el :attrs :datetime)
        :del (-> el :attrs :datetime)
        :abbr (-> el :attrs :title)
        :data (-> el :attrs :value)
        :input (-> el :attrs :value)
        ;; Default clause: without it `case` throws IllegalArgumentException
        ;; for every tag not listed above, so the fallbacks below could
        ;; never be reached for such elements.
        nil)
      (first (:content el))
      ""))
(defn parse-p
  "Build a map for the p-* classes of an HTML element: every property
  keyword derived from a \"p-\" class is associated with the element's
  p-value."
  [element]
  (let [value (get-p-value element)
        props ((classes-to-props "p-") (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value))
            {}
            props)))
(defn parse-u
  "Build a map for the u-* classes of an HTML element: every property
  keyword derived from a \"u-\" class is associated with the element's
  u-value."
  [element]
  (let [value (get-u-value element)
        props ((classes-to-props "u-") (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value))
            {}
            props)))
(defn parse-dt
  "Build a map for the dt-* classes of an HTML element: every property
  keyword derived from a \"dt-\" class is associated with the element's
  dt-value."
  [element]
  (let [value (get-dt-value element)
        props ((classes-to-props "dt-") (element-to-classes element))]
    (reduce (fn [acc prop] (assoc acc prop value))
            {}
            props)))
(defn parse-children
  "Parse element children for microformats.

  Selects, within `element`, nodes whose class attribute starts with
  \"p-\", \"u-\" or \"dt-\" and takes the first match. Returns a map
  with a single :properties key holding the merged results of `parse-p`
  and `parse-u` for that node, or an empty map when nothing matched."
  [element]
  (let [el (first (html/select element [(html/union [(html/attr-starts :class "p-")
                                                     (html/attr-starts :class "u-")
                                                     (html/attr-starts :class "dt-")])]))]
    ;; NOTE(review): dt-* nodes are selected above but `parse-dt` is not
    ;; merged here, and only the first match is examined - confirm whether
    ;; that is intentional.
    (hash-map :properties
              ;; With no match `el` is nil; passing nil on would end in
              ;; splitting a nil class string, which throws a
              ;; NullPointerException.
              (if el
                (merge (parse-p el) (parse-u el))
                {}))))
(defn parse-h
  "Select every node in `html` whose class attribute starts with \"h-\"
  and return a vector with the `parse-children` result for each one."
  [html]
  (let [roots (html/select html [(html/attr-starts :class "h-")])]
    (mapv parse-children roots)))
(defn parse
  "Parse an HTML string containing microformats. The string is read with
  enlive's `html-snippet`; the result is a map whose :items holds the
  output of `parse-h` and whose :rels is an empty map."
  [html]
  (let [items (-> html
                  html/html-snippet
                  parse-h)]
    {:items items :rels {}}))
|