# xml/impl/src/com/intellij/xml/util/documentation/html5TagTableGen.rb

require 'rexml/document'
require 'set'

# read html4 tags and attributes to be able to skip them
known4Tags = Set.new
known4Attributes = Set.new
# The block form of File.open closes the handle even when REXML raises while
# parsing, which the previous File.new / file.close pair did not guarantee.
File.open("htmltable.xml") do |file|
  doc = REXML::Document.new(file)
  # Collect the "name" attribute of every <tag> and <attribute> element.
  doc.elements.each("html-property-table/tag") { |e| known4Tags << e.attributes["name"] }
  doc.elements.each("html-property-table/attribute") { |e| known4Attributes << e.attributes["name"] }
end

# read html5 tags and attributes for verifying generated data
known5Tags = Set.new
known5Attributes = Set.new
# The block form of File.open closes the handle even when REXML raises while
# parsing, which the previous File.new / file.close pair did not guarantee.
File.open("html5table.xml") do |file|
  doc = REXML::Document.new(file)
  # Collect the "name" attribute of every <tag> and <attribute> element.
  doc.elements.each("html-property-table/tag") { |e| known5Tags << e.attributes["name"] }
  doc.elements.each("html-property-table/attribute") { |e| known5Attributes << e.attributes["name"] }
end

# read html5 spec
# Generates a <tag> entry for every element row of the spec page that the
# HTML4 table does not already cover.
generatedTags = Set.new
result = "<html-property-table baseHelpRef=\"http://www.w3.org/html/wg/drafts/html/master/\">\n"
# File.read opens, reads and closes the file in one call, so no handle stays open.
content = File.read("html5.html")
# End position of the last generated tag row; the attribute scan starts from here.
offset = 0
# parse tags
# Captures: (1) link target, (2) element name, (3) leading text of the description cell.
content.scan(/<tr><th><code><a href="([^"]+)">([^<]+).*<\/th>\s*<td>(?:<a href="[^"]+">)?([^<]+).*(<\/td>)?/) do |helpref, name, description|
  # Grab the match offsets first, before any other call can touch the match state.
  row_start, row_end = Regexp.last_match.offset(0)
  next if known4Tags.include?(name)
  # The row runs up to the next "<tr>". The last row has none, so fall back to
  # the end of the content instead of building a range with a nil end
  # (which raises on Ruby versions older than 2.6).
  next_row = content.index("<tr>", row_end) || content.length
  # A tag is flagged as empty when the word "empty" occurs anywhere in its row.
  empty = content[row_start..next_row].include?("empty")
  # Emitted as fixed values for every generated tag.
  startTag = true
  endTag = true
  dtd = ""
  result +=
       "<tag name        = \"#{name}\"\n" +
       "     helpref     = \"#{helpref}\"\n" +
       "     description = \"#{description}\"\n" +
       "     startTag    = \"#{startTag}\"\n" +
       "     endTag      = \"#{endTag}\"\n" +
       "     empty       = \"#{empty}\"\n" +
       "     dtd         = \"#{dtd}\"\n" +
       "/>\n"
  generatedTags << name
  offset = row_end
end

# Generate an <attribute> entry for each attribute row found after the tag
# rows, skipping attributes the HTML4 table already covers, then close the
# table and print it to standard output.
generatedAttributes = Set.new

# Reduces an HTML fragment to plain text: drops markup, collapses whitespace
# runs to a single space, removes double quotes, trims leading whitespace and
# removes one trailing semicolon.
to_plain_text = lambda do |html|
  html.gsub(/<[^>]*>/, "").gsub(/\s+/, " ").gsub(/"/, "").gsub(/^\s+/, "").gsub(/;$/, "")
end

attribute_rows = content[offset..-1]
# Captures: (1) attribute name, (2) cell listing the related elements (with links),
# (3) description cell, (4) value/type cell.
attribute_rows.scan(/<tr><th>\s?<code(?:[^>]*)>([^<]+)\s*<\/code>\s*<td>([^;\n]*(?:;\s*[^;\n]*)*)\s*<td>\s*(.*)\s*<td>(.*)/) do |name, field_and_link, description_html, type_html|
  next if known4Attributes.include?(name)

  type = to_plain_text.call(type_html)
  description = to_plain_text.call(description_html)
  # The first link in the elements cell doubles as the help reference; empty when absent.
  helpref = field_and_link[/<a href="([^"]*)"/, 1] || ""
  relatedTags = to_plain_text.call(field_and_link)
  # Emitted as fixed values for every generated attribute.
  dtd = ""
  default = true

  result +=
    "<attribute name        = \"#{name}\"\n" \
    "           helpref     = \"#{helpref}\"\n" \
    "           description = \"#{description}\"\n" \
    "           relatedTags = \"#{relatedTags}\"\n" \
    "           dtd         = \"#{dtd}\"\n" \
    "           type        = \"#{type}\"\n" \
    "           default     = \"#{default}\"\n" \
    "/>\n"
  generatedAttributes << name
end

result += '</html-property-table>'
puts result


# verify that we haven't missed tags or attributes
# Anything listed in html5table.xml that is neither in the HTML4 table nor in
# the freshly generated output is reported on standard error.
missing_tags = known5Tags - known4Tags - generatedTags
unless missing_tags.empty?
  # Written with print rather than printf: the message contains interpolated
  # names and must not be interpreted as a format string (a "%" would break it).
  $stderr.print "warning! missing tags: #{missing_tags.to_a.sort}\n"
end

missing_attributes = known5Attributes - known4Attributes - generatedAttributes
unless missing_attributes.empty?
  $stderr.print "warning! missing attributes: #{missing_attributes.to_a.sort}\n"
end