# Copyright (C) 2010 The Android Open Source Project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Tiny XML parser implementation in awk. # # This file is not meant to be used directly, instead copy the # functions it defines here into your own script then specialize # it appropriately. # # See further below for usage instructions and implementation details. # # ---------------------------- cut here --------------------------- function xml_event () { RS=">"; XML_TAG=XML_TYPE=""; split("", XML_ATTR); while ( 1 ) { if (_xml_closing) { # delayed direct tag closure XML_TAG = _xml_closing; XML_TYPE = "END"; _xml_closing = ""; _xml_exit(XML_TAG); return 1; } if (getline <= 0) return 0; # read new input line _xml_p = index($0, "<"); # get start marker if (_xml_p == 0) return 0; # end of file (or malformed input) $0 = substr($0, _xml_p) # remove anything before '<' # ignore CData / Comments / Processing instructions / Declarations if (_xml_in_section(" _xml_closing = XML_TAG; # record delayed tag closure. 
break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);         # extract attribute name
            sub(/^[^=]*/,"",$0);                # remove it from record
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )  # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            if (substr($0,1,2) == "=\"") {      # value is ="something"
                _xml_value = substr($0,3);
                sub(/".*$/,"",_xml_value);
                sub(/^="[^"]*"/,"",$0);
            } else if (substr($0,1,2) == "='") { # value is ='something'
                _xml_value = substr($0,3);
                sub(/'.*$/,"",_xml_value);
                sub(/^='[^']*'/,"",$0);
            } else {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            XML_ATTR[_xml_attrib] = _xml_value; # store attribute name/value
            sub(/^[ \t\r\n]*/,"",$0); # get rid of remaining leading spaces
        }
        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}

# Report a fatal error: print 'msg' to /dev/stderr and terminate the
# script with exit status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr"
    exit(1)
}

# Skip a section (CDATA, comment, ...) that starts in the current record.
#
#   sec_begin: regular expression that must match at the start of $0.
#   sec_end:   regular expression that must match at the end of a record.
#
# Returns 0 without reading anything when $0 does not start with sec_begin.
# Otherwise keeps reading records (each getline overwrites $0) until one
# ends with sec_end, then returns 1. Panics when getline stops delivering
# records before the end pattern is seen.
function _xml_in_section (sec_begin, sec_end) {
    if (!match( $0, "^" sec_begin ))
        return 0;
    while (!match($0, sec_end "$")) {
        # ERRNO is a gawk extension; presumably empty under other awks.
        if (getline <= 0)
            _xml_panic("Unexpected EOF: " ERRNO);
    }
    return 1;
}

# Record that we entered element 'tag': prepend it, followed by "/",
# to the reversed path XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = tag "/" XML_RPATH;
}

# Record that we left element 'tag': check that it is the innermost element
# of XML_RPATH (the text before the first "/"), then remove it from the path.
# Panics when 'tag' is not the element that was expected to close.
# Uses the parameter rather than the global XML_TAG so the function honours
# its own argument, like _xml_enter() does.
function _xml_exit (tag) {
    _xml_p = index(XML_RPATH, "/");
    _xml_expected = substr(XML_RPATH, 1, _xml_p-1);
    if (_xml_expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " _xml_expected);
    XML_RPATH = substr(XML_RPATH, _xml_p+1);
}

# ---------------------------- cut here ---------------------------

# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached.
# Note that in case of malformed input, an error will be printed and the
# script will force an exit(1).
#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" -> means an opening tag, "END" a
#               closing one.
#
#    XML_TAG:   name of the tag, always in UPPERCASE!
#
#    XML_ATTR:  a map of attributes for the tag. Only set for "BEGIN" types.
#               all attribute names are in lowercase.
#
#               beware: values are *not* unescaped !
#
#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
#               if you are within the <manifest><application> tag, then
#               it will be set to "APPLICATION/MANIFEST/"
#               (note the trailing slash).
#
# This is a simple example that dumps the output of the parsing.
#
BEGIN {
    for (;;) {
        if (! xml_event())
            break;                  # no more events: end of input
        printf "XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH;
        if (XML_TYPE == "BEGIN")    # attributes are only listed for opening tags
            for (name in XML_ATTR)
                printf " %s='%s'", name, XML_ATTR[name];
        printf "\n";
    }
}

# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
#   <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
#   <foo
#   <bar attrib="value"
#   stuff</bar
#   </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
#     p = index($0, "<");         # get index of '<'
#     if (p == 0) -> return 0;    # malformed input or end of file
#     $0 = substr($0, p);         # remove anything before the '<' in record
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc..
# These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
#    sec_begin: is the pattern for the start of the record.
#    sec_end:   is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
#   <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
#   "<!-- A comment with an embedded "
#   " right here ! --"