aboutsummaryrefslogtreecommitdiff
path: root/runtime/C/src/antlr3filestream.c
diff options
context:
space:
mode:
Diffstat (limited to 'runtime/C/src/antlr3filestream.c')
-rw-r--r--runtime/C/src/antlr3filestream.c474
1 files changed, 474 insertions, 0 deletions
diff --git a/runtime/C/src/antlr3filestream.c b/runtime/C/src/antlr3filestream.c
new file mode 100644
index 0000000..4430ffe
--- /dev/null
+++ b/runtime/C/src/antlr3filestream.c
@@ -0,0 +1,474 @@
+/** \file
+ * \brief The ANTLR3 C filestream is used when the source character stream
+ * is a filesystem based input set and all the characters in the filestream
+ * can be loaded at once into memory and away the lexer goes.
+ *
+ * A number of initializers are provided in order that various character
+ * sets can be supported from input files. The ANTLR3 C runtime expects
+ * to deal with UTF32 characters only (the reasons for this are to
+ * do with the simplification of C code when using this form of Unicode
+ * encoding, though this is not a panacea. More information can be
+ * found on this by consulting:
+ * - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
+ * Where a well grounded discussion of the encoding formats available
+ * may be found.
+ *
+ */
+
+// [The "BSD licence"]
+// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
+// http://www.temporal-wave.com
+// http://www.linkedin.com/in/jimidle
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 3. The name of the author may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <antlr3.h>
+
+static void setupInputStream (pANTLR3_INPUT_STREAM input);
+static pANTLR3_INPUT_STREAM antlr3CreateFileStream (pANTLR3_UINT8 fileName);
+static pANTLR3_INPUT_STREAM antlr3CreateStringStream (pANTLR3_UINT8 data);
+
+ANTLR3_API pANTLR3_INPUT_STREAM
+antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
+{
+ pANTLR3_INPUT_STREAM input;
+
+ // First order of business is to read the file into some buffer space
+ // as just straight 8 bit bytes. Then we will work out the encoding and
+ // byte order and adjust the API functions that are installed for the
+ // default 8Bit stream accordingly.
+ //
+ input = antlr3CreateFileStream(fileName);
+ if (input == NULL)
+ {
+ return NULL;
+ }
+
+ // We have the data in memory now so we can deal with it according to
+ // the encoding scheme we were given by the user.
+ //
+ input->encoding = encoding;
+
+ // Now we need to work out the endian type and install any
+ // API functions that differ from 8Bit
+ //
+ setupInputStream(input);
+
+ // Now we can set up the file name
+ //
+ input->istream->streamName = input->strFactory->newStr8(input->strFactory, fileName);
+ input->fileName = input->istream->streamName;
+
+ return input;
+}
+
+
+ANTLR3_API pANTLR3_INPUT_STREAM
+antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
+{
+ pANTLR3_INPUT_STREAM input;
+
+ // First order of business is to set up the stream and install the data pointer.
+ // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
+ // default 8Bit stream accordingly.
+ //
+ input = antlr3CreateStringStream(data);
+ if (input == NULL)
+ {
+ return NULL;
+ }
+
+ // Size (in bytes) of the given 'string'
+ //
+ input->sizeBuf = size;
+
+ // We have the data in memory now so we can deal with it according to
+ // the encoding scheme we were given by the user.
+ //
+ input->encoding = encoding;
+
+ // Now we need to work out the endian type and install any
+ // API functions that differ from 8Bit
+ //
+ setupInputStream(input);
+
+ // Now we can set up the file name
+ //
+ input->istream->streamName = input->strFactory->newStr8(input->strFactory, name);
+ input->fileName = input->istream->streamName;
+
+ return input;
+}
+
+
+/// Determine endianess of the input stream and install the
+/// API required for the encoding in that format.
+///
+static void
+setupInputStream(pANTLR3_INPUT_STREAM input)
+{
+ ANTLR3_BOOLEAN isBigEndian;
+
+ // Used to determine the endianness of the machine we are currently
+ // running on.
+ //
+ ANTLR3_UINT16 bomTest = 0xFEFF;
+
+ // What endianess is the machine we are running on? If the incoming
+ // encoding endianess is the same as this machine's natural byte order
+ // then we can use more efficient API calls.
+ //
+ if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
+ {
+ isBigEndian = ANTLR3_TRUE;
+ }
+ else
+ {
+ isBigEndian = ANTLR3_FALSE;
+ }
+
+ // What encoding did the user tell us {s}he thought it was? I am going
+ // to get sick of the questions on antlr-interest, I know I am.
+ //
+ switch (input->encoding)
+ {
+ case ANTLR3_ENC_UTF8:
+
+ // See if there is a BOM at the start of this UTF-8 sequence
+ // and just eat it if there is. Windows .TXT files have this for instance
+ // as it identifies UTF-8 even though it is of no consequence for byte order
+ // as UTF-8 does not have a byte order.
+ //
+ if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xEF
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xBB
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xBF
+ )
+ {
+ // The UTF8 BOM is present so skip it
+ //
+ input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
+ }
+
+ // Install the UTF8 input routines
+ //
+ antlr3UTF8SetupStream(input);
+ break;
+
+ case ANTLR3_ENC_UTF16:
+
+ // See if there is a BOM at the start of the input. If not then
+ // we assume that the byte order is the natural order of this
+ // machine (or it is really UCS2). If there is a BOM we determine if the encoding
+ // is the same as the natural order of this machine.
+ //
+ if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFE
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFF
+ )
+ {
+ // BOM Present, indicates Big Endian
+ //
+ input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
+
+ antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
+ }
+ else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
+ )
+ {
+ // BOM present, indicates Little Endian
+ //
+ input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
+
+ antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
+ }
+ else
+ {
+ // No BOM present, assume local computer byte order
+ //
+ antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
+ }
+ break;
+
+ case ANTLR3_ENC_UTF32:
+
+ // See if there is a BOM at the start of the input. If not then
+ // we assume that the byte order is the natural order of this
+ // machine. If there is we determine if the encoding
+ // is the same as the natural order of this machine.
+ //
+ if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0x00
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xFE
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3)) == 0xFF
+ )
+ {
+ // BOM Present, indicates Big Endian
+ //
+ input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
+
+ antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
+ }
+ else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
+ && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
+ )
+ {
+ // BOM present, indicates Little Endian
+ //
+ input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
+
+ antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
+ }
+ else
+ {
+ // No BOM present, assume local computer byte order
+ //
+ antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
+ }
+ break;
+
+ case ANTLR3_ENC_UTF16BE:
+
+ // Encoding is definately Big Endian with no BOM
+ //
+ antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
+ break;
+
+ case ANTLR3_ENC_UTF16LE:
+
+ // Encoding is definately Little Endian with no BOM
+ //
+ antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
+ break;
+
+ case ANTLR3_ENC_UTF32BE:
+
+ // Encoding is definately Big Endian with no BOM
+ //
+ antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
+ break;
+
+ case ANTLR3_ENC_UTF32LE:
+
+ // Encoding is definately Little Endian with no BOM
+ //
+ antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
+ break;
+
+ case ANTLR3_ENC_EBCDIC:
+
+ // EBCDIC is basically the same as ASCII but with an on the
+ // fly translation to ASCII
+ //
+ antlr3EBCDICSetupStream(input);
+ break;
+
+ case ANTLR3_ENC_8BIT:
+ default:
+
+ // Standard 8bit/ASCII
+ //
+ antlr38BitSetupStream(input);
+ break;
+ }
+}
+
+/** \brief Use the contents of an operating system file as the input
+ * for an input stream.
+ *
+ * \param fileName Name of operating system file to read.
+ * \return
+ * - Pointer to new input stream context upon success
+ * - One of the ANTLR3_ERR_ defines on error.
+ */
+static pANTLR3_INPUT_STREAM
+antlr3CreateFileStream(pANTLR3_UINT8 fileName)
+{
+ // Pointer to the input stream we are going to create
+ //
+ pANTLR3_INPUT_STREAM input;
+ ANTLR3_UINT32 status;
+
+ if (fileName == NULL)
+ {
+ return NULL;
+ }
+
+ // Allocate memory for the input stream structure
+ //
+ input = (pANTLR3_INPUT_STREAM)
+ ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
+
+ if (input == NULL)
+ {
+ return NULL;
+ }
+
+ // Structure was allocated correctly, now we can read the file.
+ //
+ status = antlr3read8Bit(input, fileName);
+
+ // Call the common 8 bit input stream handler
+ // initialization.
+ //
+ antlr3GenericSetupStream(input);
+
+ // However if the file was not there or something then we
+ // need to close. Have to wait until here as we cannot call
+ // close until the API is installed of course.
+ //
+ if (status != ANTLR3_SUCCESS)
+ {
+ input->close(input);
+ return NULL;
+ }
+
+ return input;
+}
+
+ANTLR3_API ANTLR3_UINT32
+antlr3read8Bit(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 fileName)
+{
+ ANTLR3_FDSC infile;
+ ANTLR3_UINT32 fSize;
+
+ /* Open the OS file in read binary mode
+ */
+ infile = antlr3Fopen(fileName, "rb");
+
+ /* Check that it was there
+ */
+ if (infile == NULL)
+ {
+ return (ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
+ }
+
+ /* It was there, so we can read the bytes now
+ */
+ fSize = antlr3Fsize(fileName); /* Size of input file */
+
+ /* Allocate buffer for this input set
+ */
+ input->data = ANTLR3_MALLOC((size_t)fSize);
+ input->sizeBuf = fSize;
+
+ if (input->data == NULL)
+ {
+ return (ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
+ }
+
+ input->isAllocated = ANTLR3_TRUE;
+
+ /* Now we read the file. Characters are not converted to
+ * the internal ANTLR encoding until they are read from the buffer
+ */
+ antlr3Fread(infile, fSize, input->data);
+
+ /* And close the file handle
+ */
+ antlr3Fclose(infile);
+
+ return ANTLR3_SUCCESS;
+}
+
+/** \brief Open an operating system file and return the descriptor
+ * We just use the common open() and related functions here.
+ * Later we might find better ways on systems
+ * such as Windows and OpenVMS for instance. But the idea is to read the
+ * while file at once anyway, so it may be irrelevant.
+ */
+ANTLR3_API ANTLR3_FDSC
+antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
+{
+ return (ANTLR3_FDSC)fopen((const char *)filename, mode);
+}
+
+/** \brief Close an operating system file and free any handles
+ * etc.
+ */
+ANTLR3_API void
+antlr3Fclose(ANTLR3_FDSC fd)
+{
+ fclose(fd);
+}
+ANTLR3_API ANTLR3_UINT32
+antlr3Fsize(pANTLR3_UINT8 fileName)
+{
+ struct _stat statbuf;
+
+ _stat((const char *)fileName, &statbuf);
+
+ return (ANTLR3_UINT32)statbuf.st_size;
+}
+
+ANTLR3_API ANTLR3_UINT32
+antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count, void * data)
+{
+ return (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
+}
+
+
+/** \brief Use the supplied 'string' as input to the stream
+ *
+ * \param data Pointer to the input data
+ * \return
+ * - Pointer to new input stream context upon success
+ * - NULL defines on error.
+ */
+static pANTLR3_INPUT_STREAM
+antlr3CreateStringStream(pANTLR3_UINT8 data)
+{
+ // Pointer to the input stream we are going to create
+ //
+ pANTLR3_INPUT_STREAM input;
+
+ if (data == NULL)
+ {
+ return NULL;
+ }
+
+ // Allocate memory for the input stream structure
+ //
+ input = (pANTLR3_INPUT_STREAM)
+ ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
+
+ if (input == NULL)
+ {
+ return NULL;
+ }
+
+ // Structure was allocated correctly, now we can install the pointer
+ //
+ input->data = data;
+ input->isAllocated = ANTLR3_FALSE;
+
+ // Call the common 8 bit input stream handler
+ // initialization.
+ //
+ antlr3GenericSetupStream(input);
+
+ return input;
+} \ No newline at end of file