From 961d32f58072b5fa01ae6e2135f854ab318e5b64 Mon Sep 17 00:00:00 2001
From: David Morgan
Date: Wed, 2 Nov 2016 00:41:11 +0000
Subject: [PATCH 1/1] import codebase

---
 CMakeLists.txt          |  17 +++
 LICENSE                 |  21 ++++
 README.md               | 188 ++++++++++++++++++++++++++++++
 include/taihen/lexer.h  |  36 ++++++
 include/taihen/parser.h |  16 +++
 src/CMakeLists.txt      |  10 ++
 src/lexer.c             | 209 +++++++++++++++++++++++++++++++++
 src/parser.c            | 249 ++++++++++++++++++++++++++++++++++++++++
 test/CMakeLists.txt     |  10 ++
 test/test_lexer.cpp     | 208 +++++++++++++++++++++++++++++++++
 test/test_parser.cpp    |   9 ++
 11 files changed, 973 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 include/taihen/lexer.h
 create mode 100644 include/taihen/parser.h
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/lexer.c
 create mode 100644 src/parser.c
 create mode 100644 test/CMakeLists.txt
 create mode 100644 test/test_lexer.cpp
 create mode 100644 test/test_parser.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4a1df26
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.1.0)
+
+project(taihen-config)
+include_directories(include)
+
+add_subdirectory(src)
+
+if (TEST)
+    add_subdirectory(test)
+
+    enable_testing()
+    add_test(NAME LexerTest COMMAND test-lexer)
+endif()
+
+install(DIRECTORY include/taihen/
+        DESTINATION include/taihen
+        FILES_MATCHING PATTERN "*.h")
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c2582a7
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 David "Davee" Morgan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4c83852
--- /dev/null
+++ b/README.md
@@ -0,0 +1,188 @@
+# taihen-parser - _taiHEN's configuration parser_
+
+taiHEN is a custom firmware (CFW) framework for PS Vita™. It implements a configuration format that lets packagers and users control which modules or plugins are loaded, and when.
+taihen-parser provides a convenient C API for interacting with these configuration files, helping developers write supporting tools for taiHEN. It offers both a lexer and a parser API.
+
+The problem with CFW of a previous era was that each one was a single person's vision of a custom firmware. Average developer X could not easily replace the in-game menu within the provided CFW.
+Likewise, average user Y could not strip out features they did not like. Here, taiHEN provides a solution: a configuration file in which a CFW can be defined as a set of modules and plugins.
+
+Person X may like live-area mod1 and person Y may like live-area mod2. No longer do these two need to choose between CFW A and CFW B, which implement mod1 and mod2, respectively. Instead, they can modify the configuration of their favourite CFW to use whichever mod they prefer. This architecture promotes the _custom_ in _custom firmware_ by encouraging developers to move away from the huge monolithic CFW of the past and helps nurture an open, compatible and _user-orientated_ custom firmware experience.
+
+## Configuration Format
+taiHEN employs a text-based format for configuring the automatic loading of modules. The configuration is a UTF-8 text file that uses line separation to ease parsing and human readability. Each line must be exclusively one of four types:
+ - An empty line
+ - A comment
+ - A section
+ - A module path
+
+Each line can be at most ```CONFIG_MAX_LINE_LENGTH``` characters wide, and trailing/leading whitespace is permitted.
+
+## Lexer Tokens
+The config lexer produces the following tokens:
+ - ```CONFIG_START_TOKEN```
+ - ```CONFIG_END_TOKEN```
+ - ```CONFIG_COMMENT_TOKEN```
+ - ```CONFIG_SECTION_TOKEN```
+ - ```CONFIG_SECTION_HALT_TOKEN```
+ - ```CONFIG_SECTION_NAME_TOKEN```
+ - ```CONFIG_PATH_TOKEN```
+
+A valid configuration format should obey the grammar:
+```
+config ::= CONFIG_START_TOKEN (CONFIG_COMMENT_TOKEN | section)* CONFIG_END_TOKEN
+section ::= CONFIG_SECTION_TOKEN CONFIG_SECTION_HALT_TOKEN? CONFIG_SECTION_NAME_TOKEN ('\n' | EOF) path*
+path ::= CONFIG_PATH_TOKEN ('\n' | EOF)
+```
+
+## Sections: ```*```
+A section in the configuration file functions as a filter and controller for CFW module loading.
+Each section begins with a ```*``` and can optionally be followed by a ```!``` to mark the section as a halt point (see further below). After these tokens, the rest of the line is a UTF-8 name for the section.
+
+A section of the same name may appear in the file multiple times. This is intended to allow users to take advantage of taiHEN's load-ordering policy.
+
+### Halt point: ```!```
+A section can optionally have the halt point token ```!``` following the section token ```*``` in the configuration file. This token instructs the parser to stop further parsing of the file if the section name is within context. See the examples below for a worked case of this feature.
+
+### Reserved names
+There are currently two reserved names for sections:
+ - ```ALL``` - A catch-all user-mode section that will load the modules it contains into every user-mode process.
+ - ```KERNEL``` - A section that loads resident kernel modules on the start of taiHEN.
+
+Using the halt point ```!``` on these sections results in undefined behaviour.
+
+## API
+This API currently offers no guarantee of stability. Please remember that it may change drastically in future versions of taiHEN.
+taiHEN's configuration parser exposes its lexer algorithm to assist in the development of supporting tools. Please consult the header files for documentation.
+
+## Example Configurations
+
+Below is an example of a very simple configuration:
+```
+# example simple config
+*ALL
+ux0:/plugins/my_plugin.suprx
+ux0:/plugins/my_plugin2.suprx
+```
+
+This example consists of a single section, ```ALL```, which means that every game/application/homebrew that is launched will have both ```my_plugin.suprx``` and ```my_plugin2.suprx``` loaded into its process space, in that order.
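+
+To give a feel for the API, here is a minimal sketch of a tool that consumes this simple configuration using the functions declared in ```include/taihen/parser.h```. The configuration text is embedded inline and the title id ```MLCL00001``` is only an example context; a real tool would read the file from disk and choose the title id of interest:
+
+```c
+#include <taihen/parser.h>
+
+#include <stdio.h>
+
+/* receives each module path selected for the chosen section, in load order */
+static void print_module(const char *module, void *param)
+{
+    (void)param;
+    printf("%s\n", module);
+}
+
+int main(void)
+{
+    const char *config =
+        "# example simple config\n"
+        "*ALL\n"
+        "ux0:/plugins/my_plugin.suprx\n"
+        "ux0:/plugins/my_plugin2.suprx\n";
+
+    /* check the document is well formed before streaming it */
+    if (!taihen_config_validate(config))
+    {
+        return 1;
+    }
+
+    /* ALL matches any user-mode context, so both paths are printed in order */
+    taihen_config_parse(config, "MLCL00001", print_module, NULL);
+    return 0;
+}
+```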
+
+More precise functionality may be required for certain homebrew. Perhaps you wish to package your own CFW, in which case you may create a more complex configuration such as the one shown below:
+```
+# hello this is a comment. this line is ignored
+ # this line also
+  # this too, whitespace at the start of a line is OK
+*COOL_GAME
+# i'm within a section, woo!
+ ux0:/coolgame/plugin.suprx
+  # indentation is ok with me
+  ux0:/coolgame/plugin2.suprx
+  # spaces within path is ok
+  ux0:/really cool/I haVe spaces and caps/plugin3.suprx
+# next section
+*ALL
+  # i'm a special section!
+  # i'm always included... usually
+  ux0:/plugins/ingamemusic.suprx
+*KERNEL
+  # i'm a special section also!
+  # my plugins are loaded to kernel memory as resident modules
+  ux0:/taihen/henkaku.skprx
+  ux0:/psphacks.skprx
+*COOL_GAME
+  # this section again?! this is ok! this is a way packagers
+  # can take advantage of load order.
+  ux0:/coolgame/idependoningamemusic.suprx
+*!COOL_GAME2
+  # what is the '!' for?
+  # the '!' prevents further parsing
+  # this would make more sense to put at the start if you want to
+  # blacklist certain modules
+  # look, nothing to load!
+*ALL
+  ux0:/plugins/ibreak_coolgame2.suprx
+
+  # emojis?
+  ux0:/🤔/🦄/👻/🎃.suprx
+```
+Much more complex, and I expect even more complexity once real CFW components come around. As mentioned previously, parsing occurs from top to bottom, identical to load order. When parsing, a section context is selected. In the case of taiHEN, this context is a title id such as ```MLCL00001``` for our molecularShell homebrew. For ease, let's assume that we have selected ```COOL_GAME``` and that it is a user-mode process.
+
+Comments are ignored, so let's continue until we reach the first section: ```COOL_GAME```. Since our selected section matches this first section, the paths below it are loaded until a new section is reached:
+ - ```ux0:/coolgame/plugin.suprx```
+ - ```ux0:/coolgame/plugin2.suprx```
+ - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
+
+Then we reach a new section, ```ALL```. As mentioned above, ```ALL``` is a special reserved section name that matches every user-mode process. So our loaded module list grows:
+ - ```ux0:/coolgame/plugin.suprx```
+ - ```ux0:/coolgame/plugin2.suprx```
+ - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
+ - ```ux0:/plugins/ingamemusic.suprx```
+
+The next section we reach is the special section ```KERNEL```. This is not processed within our context, so we continue until we reach the next section: ```COOL_GAME```. We have already seen this section before, but multiple sections of the same name are allowed in order to take advantage of taiHEN's module load ordering. This is extremely useful when you have dependencies between plugins/modules that need to be resolved. In this example we have ```idependoningamemusic.suprx```, which must be loaded after ```ingamemusic.suprx```.
+
+Our load list now looks like:
+ - ```ux0:/coolgame/plugin.suprx```
+ - ```ux0:/coolgame/plugin2.suprx```
+ - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
+ - ```ux0:/plugins/ingamemusic.suprx```
+ - ```ux0:/coolgame/idependoningamemusic.suprx```
+
+The next section is ```COOL_GAME2```, which does not match our section context. This section has a halt point ```!```, but we ignore it in this case because we do not match the section.
+
+Lastly, we have the final section ```ALL``` again, which completes our load list:
+ - ```ux0:/coolgame/plugin.suprx```
+ - ```ux0:/coolgame/plugin2.suprx```
+ - ```ux0:/really cool/I haVe spaces and caps/plugin3.suprx```
+ - ```ux0:/plugins/ingamemusic.suprx```
+ - ```ux0:/coolgame/idependoningamemusic.suprx```
+ - ```ux0:/plugins/ibreak_coolgame2.suprx```
+ - ```ux0:/🤔/🦄/👻/🎃.suprx```
+
+NOTE: I don't know conclusively whether the Vita filesystem supports emojis. Don't use them...
+
+### ```COOL_GAME2``` Halt Point Example
+Following the same logic as above, we will walk through the configuration with ```COOL_GAME2``` as the section context.
+
+The first section is ```COOL_GAME```; it is not a match, so we skip it.
+
+The second section is ```ALL```, so we load modules from it:
+ - ```ux0:/plugins/ingamemusic.suprx```
+
+The third section is ```KERNEL```, so we skip it.
+
+The fourth section is ```COOL_GAME``` again, so we skip it.
+
+The fifth section is ```COOL_GAME2```, so we process it. This time we have a halt point, so this will be the last section we process. Remember, the halt point ```!``` stops any further parsing. This section, however, has no modules, so nothing is loaded; a section with no modules is OK. In this case, the ```ALL``` section that follows breaks ```COOL_GAME2``` in our hypothetical world. By using the halt point correctly, a CFW packager can maximise compatibility whilst maintaining load ordering.
+
+Our final module loading list for ```COOL_GAME2```:
+ - ```ux0:/plugins/ingamemusic.suprx```
+
+# Building
+To build taihen-parser, you require CMake to generate the appropriate build scripts.
+From within the repository directory:
+```sh
+$ mkdir build && cd build
+$ cmake ..
+$ make
+```
+
+To build the included tests you require the Boost ```unit_test_framework``` component to be installed. Then instead use:
+```sh
+$ mkdir build && cd build
+$ cmake -DTEST=ON ..
+$ make
+```
+
+# Installation
+To install to a specified location, define ```CMAKE_INSTALL_PREFIX```:
+```sh
+$ mkdir build && cd build
+$ cmake -DCMAKE_INSTALL_PREFIX=/my/install/location ..
+$ make
+$ make install
+```
+
+# Acknowledgements
+Team molecule for HENkaku, Yifan Lu for taiHEN and xyz for immense support of the vitasdk.
+
+## License
+taihen-parser is licensed under the terms of the MIT license, which can be read in the ```LICENSE``` file in the root of the repository.
+(C) 2016 David "Davee" Morgan
diff --git a/include/taihen/lexer.h b/include/taihen/lexer.h
new file mode 100644
index 0000000..c71923c
--- /dev/null
+++ b/include/taihen/lexer.h
@@ -0,0 +1,36 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CONFIG_MAX_LINE_LENGTH (256)
+
+typedef enum
+{
+    CONFIG_START_TOKEN,
+    CONFIG_END_TOKEN,
+    CONFIG_COMMENT_TOKEN,
+    CONFIG_SECTION_TOKEN,
+    CONFIG_SECTION_HALT_TOKEN,
+    CONFIG_SECTION_NAME_TOKEN,
+    CONFIG_PATH_TOKEN
+} taihen_config_lexer_token;
+
+typedef struct
+{
+    const char *input;
+    const char *end;
+    taihen_config_lexer_token token;
+    char line[CONFIG_MAX_LINE_LENGTH];
+    char *line_pos;
+} taihen_config_lexer;
+
+int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input);
+int taihen_config_lex(taihen_config_lexer *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // LEXER_H
diff --git a/include/taihen/parser.h b/include/taihen/parser.h
new file mode 100644
index 0000000..e04561a
--- /dev/null
+++ b/include/taihen/parser.h
@@ -0,0 +1,16 @@
+#ifndef PARSER_H
+#define PARSER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (* taihen_config_handler)(const char *module, void *param);
+
+int taihen_config_validate(const char *input);
+void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // PARSER_H
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..12cf0ce
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,10 @@
+if (MSVC)
+    add_definitions(-Dinline=__inline)
+endif()
+
+add_library(taihenconfig lexer.c parser.c)
+
+install(TARGETS taihenconfig
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
diff --git a/src/lexer.c b/src/lexer.c
new file mode 100644
index 0000000..e9ee49c
--- /dev/null
+++ b/src/lexer.c
@@ -0,0 +1,209 @@
+/*
+ * lexer.c - tokenisation algorithm for taihen configuration files
+ *
+ * Copyright (C) 2016 David "Davee" Morgan
+ *
+ * This software may be modified and distributed under the terms
+ * of the MIT license. See the LICENSE file for details.
+ */
+
+#include <taihen/lexer.h>
+
+#include <ctype.h>
+#include <string.h>
+
+static const char TOKEN_EMPTY = '\0';
+static const char TOKEN_COMMENT_START = '#';
+static const char TOKEN_SECTION_START = '*';
+static const char TOKEN_HALT = '!';
+
+static char *skip_whitespace(char *input)
+{
+    while (isspace((unsigned char)*input))
+    {
+        ++input;
+    }
+
+    return input;
+}
+
+static void trim_whitespace(char *input)
+{
+    char *end = input + strlen(input)-1;
+
+    while (end > input)
+    {
+        if (!isspace((unsigned char)*end))
+        {
+            break;
+        }
+
+        *end = '\0';
+        end--;
+    }
+}
+
+static const char *get_newline(const char *input)
+{
+    while (*input)
+    {
+        if (*input == '\r' || *input == '\n')
+        {
+            break;
+        }
+
+        ++input;
+    }
+
+    return input;
+}
+
+static int lex_line(taihen_config_lexer *ctx)
+{
+    if (ctx->input >= ctx->end)
+    {
+        ctx->token = CONFIG_END_TOKEN;
+        return 0;
+    }
+
+    const char *line_end = get_newline(ctx->input);
+    size_t len = line_end - ctx->input;
+
+    // check our line can fit in our buffer
+    if (len >= CONFIG_MAX_LINE_LENGTH)
+    {
+        return -1;
+    }
+
+    // copy line to our buffer so we can modify it
+    memcpy(ctx->line, ctx->input, len);
+    ctx->line[len] = '\0';
+    ctx->line_pos = ctx->line;
+    ctx->input = line_end+1;
+
+    // remove leading whitespace
+    ctx->line_pos = skip_whitespace(ctx->line_pos);
+
+    // check for empty line or comment
+    if (*ctx->line_pos == TOKEN_EMPTY || *ctx->line_pos == TOKEN_COMMENT_START)
+    {
+        ctx->token = CONFIG_COMMENT_TOKEN;
+        return 1;
+    }
+
+    // remove any trailing whitespace
+    trim_whitespace(ctx->line_pos);
+
+    // check if our line is empty now
+    if (*ctx->line_pos == TOKEN_EMPTY)
+    {
+        ctx->token = CONFIG_COMMENT_TOKEN;
+        return 1;
+    }
+
+    // check for section start
+    if (*ctx->line_pos == TOKEN_SECTION_START)
+    {
+        ctx->token = CONFIG_SECTION_TOKEN;
+    }
+    else
+    {
+        // should be a path
+        ctx->token = CONFIG_PATH_TOKEN;
+    }
+
+    return 1;
+}
+
+static int lex_section_halt(taihen_config_lexer *ctx)
+{
+    // skip more whitespace
+    ctx->line_pos = skip_whitespace(ctx->line_pos+1);
+
+    // check for halt token
+    if (*ctx->line_pos == TOKEN_HALT)
+    {
+        ctx->token = CONFIG_SECTION_HALT_TOKEN;
+    }
+    else
+    {
+        // should be a name
+        ctx->token = CONFIG_SECTION_NAME_TOKEN;
+    }
+
+    return 1;
+}
+
+static int lex_section_name(taihen_config_lexer *ctx)
+{
+    // skip more whitespace
+    ctx->line_pos = skip_whitespace(ctx->line_pos+1);
+
+    // should be a name
+    ctx->token = CONFIG_SECTION_NAME_TOKEN;
+    return 1;
+}
+
+/*!
+ \brief Initialise or reset lexer context.
+
+ taihen_config_init_lexer will init/reset the provided taihen_config_lexer and assign the
+ provided input to the context.
+
+ \param ctx A non-null pointer to a context to initialise or reset.
+ \param input A non-null UTF-8 encoded null-terminated string to tokenise.
+ \return zero on success, < 0 on error.
+ */
+int taihen_config_init_lexer(taihen_config_lexer *ctx, const char *input)
+{
+    if (ctx == NULL || input == NULL)
+    {
+        return -1;
+    }
+
+    // reset everything to default and reset input/end pointer
+    memset(ctx, 0, sizeof(taihen_config_lexer));
+    ctx->token = CONFIG_START_TOKEN;
+    ctx->input = input;
+    ctx->end = input + strlen(input);
+    return 0;
+}
+
+/*!
+ \brief Retrieve the next lexer token.
+
+ taihen_config_lex will accept an initialised context and provide the next token
+ in the stream. This tokenisation does no checking on formatting and as such does not
+ confirm that the document provided is well-formed.
+
+ \param ctx A non-null pointer to an initialised context.
+ \return 0 if there are no further tokens, > 0 if there are further tokens, or < 0 on error.
+ \sa taihen_config_init_lexer
+ */
+int taihen_config_lex(taihen_config_lexer *ctx)
+{
+    if (ctx == NULL)
+    {
+        return -1;
+    }
+
+    switch (ctx->token)
+    {
+    case CONFIG_START_TOKEN:
+    case CONFIG_COMMENT_TOKEN:
+    case CONFIG_PATH_TOKEN:
+    case CONFIG_SECTION_NAME_TOKEN:
+        return lex_line(ctx);
+
+    case CONFIG_SECTION_TOKEN:
+        return lex_section_halt(ctx);
+
+    case CONFIG_SECTION_HALT_TOKEN:
+        return lex_section_name(ctx);
+
+    case CONFIG_END_TOKEN:
+    default:
+        return -1;
+    }
+}
diff --git a/src/parser.c b/src/parser.c
new file mode 100644
index 0000000..800a987
--- /dev/null
+++ b/src/parser.c
@@ -0,0 +1,249 @@
+/*
+ * parser.c - parser algorithm for taihen configuration files
+ *
+ * Copyright (C) 2016 David "Davee" Morgan
+ *
+ * This software may be modified and distributed under the terms
+ * of the MIT license. See the LICENSE file for details.
+ */
+
+#include <taihen/parser.h>
+#include <taihen/lexer.h>
+
+#include <string.h>
+
+static const char *TOKEN_ALL_SECTION = "ALL";
+static const char *TOKEN_KERNEL_SECTION = "KERNEL";
+
+static inline int is_continuation_byte(char b)
+{
+    return ((b & 0xC0) == 0x80);
+}
+
+static inline int check_continuation_bytes(const char *start, const char *end, int len)
+{
+    if ((end - start) < len)
+    {
+        return 0;
+    }
+
+    for (int i = 0; i < len; ++i)
+    {
+        if (!is_continuation_byte(start[i]))
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+static int check_utf8_sequence(const char *str, const char *end, unsigned char mask, unsigned char lead, int cont_len)
+{
+    if ((*str & mask) == lead)
+    {
+        // lead byte matches: the required continuation bytes must be present and valid
+        if (!check_continuation_bytes(str+1, end, cont_len))
+        {
+            return -1;
+        }
+
+        return 1;
+    }
+
+    return 0;
+}
+
+static int check_utf8(const char *str)
+{
+    struct
+    {
+        unsigned char mask;
+        unsigned char lead;
+        unsigned char cont_len;
+    } utf8_lut[4] =
+    {
+        { 0x80, 0x00, 0 }, // U+0000 -> U+007F, 0xxxxxxx
+        { 0xE0, 0xC0, 1 }, // U+0080 -> U+07FF, 110xxxxx
+        { 0xF0, 0xE0, 2 }, // U+0800 -> U+FFFF, 1110xxxx
+        { 0xF8, 0xF0, 3 }, // U+10000 -> U+10FFFF, 11110xxx
+    };
+
+    const char *end = str + strlen(str);
+
+    while (str < end)
+    {
+        int i = 0;
+
+        for (i = 0; i < 4; ++i)
+        {
+            int res = check_utf8_sequence(str, end, utf8_lut[i].mask, utf8_lut[i].lead, utf8_lut[i].cont_len);
+
+            // check if valid sequence but incorrect continuation
+            if (res < 0)
+            {
+                return 0;
+            }
+
+            // check if valid sequence
+            if (res > 0)
+            {
+                str += utf8_lut[i].cont_len+1;
+                break;
+            }
+        }
+
+        // check if we had no valid sequences
+        if (i == 4)
+        {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+/*!
+ \brief Check whether a configuration is valid syntax.
+
+ taihen_config_validate is used to check whether a provided configuration is valid syntax.
+ This is useful when used before taihen_config_parse to provide error checking before stream-based
+ parsing.
+
+ \param input A UTF-8 encoded null-terminated string containing the configuration to check.
+ \return non-zero on valid configuration, else zero on invalid.
+ \sa taihen_config_parse
+ */
+int taihen_config_validate(const char *input)
+{
+    taihen_config_lexer ctx;
+    taihen_config_init_lexer(&ctx, input);
+
+    int have_section = 0;
+    int lex_result = 0;
+
+    while ((lex_result = taihen_config_lex(&ctx)) > 0)
+    {
+        switch (ctx.token)
+        {
+        case CONFIG_SECTION_NAME_TOKEN:
+            // ensure we actually have a string
+            if (strlen(ctx.line_pos) == 0)
+            {
+                return 0;
+            }
+
+            // validate it is UTF-8
+            if (!check_utf8(ctx.line_pos))
+            {
+                return 0;
+            }
+
+            have_section = 1;
+            break;
+
+        case CONFIG_PATH_TOKEN:
+            if (!have_section)
+            {
+                // paths must belong to a section
+                return 0;
+            }
+
+            // ensure we actually have a string
+            if (strlen(ctx.line_pos) == 0)
+            {
+                return 0;
+            }
+
+            // validate it is UTF-8
+            if (!check_utf8(ctx.line_pos))
+            {
+                return 0;
+            }
+
+            break;
+
+        // ignore these, nothing to check
+        case CONFIG_SECTION_HALT_TOKEN:
+        case CONFIG_COMMENT_TOKEN:
+        case CONFIG_SECTION_TOKEN:
+        case CONFIG_END_TOKEN:
+            break;
+
+        // unexpected tokens, invalid document
+        default:
+            return 0;
+        }
+    }
+
+    return (lex_result == 0);
+}
+
+/*!
+ \brief taihen_config_parse parses a configuration for contextualised paths.
+
+ taihen_config_parse is used to obtain an ordered stream of the paths appropriate for the section provided.
+ Special sections such as ALL and KERNEL will be taken into consideration when generating the stream.
+
+ taihen_config_parse provides no error checking or handling. Use taihen_config_validate before parsing the
+ document to avoid errors in parsing.
+
+ \param input A UTF-8 encoded null-terminated string containing the configuration to parse.
+ \param section A UTF-8 encoded null-terminated string containing the section to base context from.
+ \param handler A taihen_config_handler to receive the stream of paths.
+ \param param A user provided value that is passed to the provided taihen_config_handler.
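+
+ A minimal illustrative sketch (the handler name and title id here are arbitrary):
+ \code
+ static void add_to_load_list(const char *module, void *param)
+ {
+     // record "module" somewhere; "param" is the user pointer passed below
+ }
+
+ if (taihen_config_validate(config_text))
+ {
+     taihen_config_parse(config_text, "MLCL00001", add_to_load_list, NULL);
+ }
+ \endcode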
+ \sa taihen_config_validate
+ */
+void taihen_config_parse(const char *input, const char *section, taihen_config_handler handler, void *param)
+{
+    taihen_config_lexer ctx;
+    taihen_config_init_lexer(&ctx, input);
+
+    int halt_flag = 0;
+    int record_entries = 0;
+
+    while (taihen_config_lex(&ctx) > 0)
+    {
+        switch (ctx.token)
+        {
+        case CONFIG_SECTION_HALT_TOKEN:
+            halt_flag = 1;
+            break;

+        case CONFIG_SECTION_NAME_TOKEN:
+            if (strcmp(ctx.line_pos, TOKEN_ALL_SECTION) == 0 && strcmp(section, TOKEN_KERNEL_SECTION) != 0)
+            {
+                record_entries = 1;
+            }
+            else if (strcmp(section, ctx.line_pos) == 0)
+            {
+                record_entries = 1;
+            }
+            else
+            {
+                record_entries = 0;
+            }
+
+            break;
+
+        case CONFIG_SECTION_TOKEN:
+            if (record_entries && halt_flag)
+            {
+                return;
+            }
+
+            halt_flag = 0;
+            break;
+
+        case CONFIG_PATH_TOKEN:
+            if (record_entries)
+            {
+                handler(ctx.line_pos, param);
+            }
+
+            break;
+
+        default:
+            break;
        }
+    }
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..5273969
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,10 @@
+find_package(Boost COMPONENTS unit_test_framework REQUIRED)
+
+include_directories(${taihen-config_SOURCE_DIR}/src ${Boost_INCLUDE_DIRS})
+link_directories(${Boost_LIBRARY_DIRS})
+
+add_executable(test-lexer test_lexer.cpp)
+target_link_libraries(test-lexer taihenconfig)
+
+add_executable(test-parser test_parser.cpp)
+target_link_libraries(test-parser taihenconfig)
diff --git a/test/test_lexer.cpp b/test/test_lexer.cpp
new file mode 100644
index 0000000..09027ea
--- /dev/null
+++ b/test/test_lexer.cpp
@@ -0,0 +1,208 @@
+#include <taihen/lexer.h>
+
+#define BOOST_TEST_MODULE lexer
+#include <boost/test/included/unit_test.hpp>
+
+#include <algorithm>
+#include <cstring>
+#include <iomanip>
+#include <iterator>
+#include <random>
+#include <sstream>
+#include <string>
+#include <vector>
+
+BOOST_AUTO_TEST_CASE(init_lexer)
+{
+    const char *input = "";
+    taihen_config_lexer ctx;
+
+    // test NULL parameter handling
+    BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, NULL), 0);
+    BOOST_REQUIRE_LT(taihen_config_init_lexer(&ctx, NULL), 0);
+    BOOST_REQUIRE_LT(taihen_config_init_lexer(NULL, input), 0);
+
+    // test correct input
+    BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0);
+}
+
+BOOST_AUTO_TEST_CASE(empty_lex)
+{
+    const char *input = "";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect immediate end of stream
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(reset_lexer)
+{
+    const char *input = "";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect immediate end of stream
+    BOOST_WARN_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_WARN_EQUAL(ctx.token, CONFIG_END_TOKEN);
+
+    // reset the lexer
+    BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, input), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_START_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(simple_section_lex)
+{
+    const char *input = "*MY SECTION";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect section token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN);
+
+    // then we expect name
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN);
+
+    // check name is still "MY SECTION" (compare contents, not pointers)
+    BOOST_TEST(std::string(ctx.line_pos) == "MY SECTION");
+
+    // then we expect end of stream
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
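+// illustrative extra case: the lexer trims surrounding whitespace, so an
+// indented section header should still produce a clean section name
+BOOST_AUTO_TEST_CASE(whitespace_section_lex)
+{
+    const char *input = "  *   SPACED NAME  ";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect section token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN);
+
+    // then we expect the name with leading/trailing whitespace removed
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN);
+    BOOST_TEST(std::string(ctx.line_pos) == "SPACED NAME");
+
+    // then we expect end of stream
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+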
+BOOST_AUTO_TEST_CASE(complex_section_lex)
+{
+    const char *input = "*!MY SECTION";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect section token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_TOKEN);
+
+    // we should expect section halt token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_HALT_TOKEN);
+
+    // then we expect name
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_SECTION_NAME_TOKEN);
+
+    // check name is still "MY SECTION" (compare contents, not pointers)
+    BOOST_TEST(std::string(ctx.line_pos) == "MY SECTION");
+
+    // then we expect end of stream
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+
+BOOST_AUTO_TEST_CASE(whitespace_lex)
+{
+    const char *input = "\t\t \t\t";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect comment token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN);
+
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(comment_lex)
+{
+    const char *input = "#this is a comment";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect comment token
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_COMMENT_TOKEN);
+
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(path_lex)
+{
+    const char *input = "this:/is/a/path";
+    taihen_config_lexer ctx;
+
+    BOOST_WARN_GE(taihen_config_init_lexer(&ctx, input), 0);
+
+    // we should expect path token: this isn't valid config syntax on its own,
+    // but it is not the lexer's job to enforce ordering - it just tokenises the input
+    BOOST_REQUIRE_GT(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_PATH_TOKEN);
+
+    BOOST_REQUIRE_EQUAL(taihen_config_lex(&ctx), 0);
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(random_lex)
+{
+    std::random_device seed;
+    std::mt19937_64 mt;
+    std::vector<unsigned char> line(255);
+    taihen_config_lexer ctx;
+
+    // seed mt from random device
+    mt.seed(seed());
+
+    for (auto i = 0; i < 100000; ++i)
+    {
+        // generate fresh random data each iteration (reference the engine so its state advances)
+        std::generate(line.begin(), std::prev(line.end()), [&mt] { return static_cast<unsigned char>(mt()); });
+
+        line[254] = '\0';
+
+        BOOST_WARN_GE(taihen_config_init_lexer(&ctx, (char *)(line.data())), 0);
+
+        while (1)
+        {
+            int res = taihen_config_lex(&ctx);
+
+            if (res < 0)
+            {
+                std::stringstream ss;
+
+                ss << "on generated data: " << std::hex << std::setfill('0');
+
+                std::for_each(line.begin(), line.end(), [&ss](auto& v)
+                {
+                    ss << std::setw(2) << static_cast<int>(v);
+                });
+
+                ss << std::endl;
+
+                BOOST_TEST_REQUIRE(res >= 0, ss.str());
+            }
+
+            if (res == 0)
+            {
+                break;
+            }
+        }
+    }
+
+    BOOST_REQUIRE_EQUAL(ctx.token, CONFIG_END_TOKEN);
+}
+
+BOOST_AUTO_TEST_CASE(long_line_lex)
+{
+    char line[CONFIG_MAX_LINE_LENGTH+1];
+    taihen_config_lexer ctx;
+
+    std::memset(line, 'a', sizeof(line));
+    line[CONFIG_MAX_LINE_LENGTH] = '\0';
+
+    BOOST_REQUIRE_GE(taihen_config_init_lexer(&ctx, line), 0);
+    BOOST_REQUIRE_LT(taihen_config_lex(&ctx), 0);
+}
diff --git a/test/test_parser.cpp b/test/test_parser.cpp
new file mode 100644
index 0000000..1df51af
--- /dev/null
+++ b/test/test_parser.cpp
@@ -0,0 +1,9 @@
+#include <taihen/parser.h>
+
+#define BOOST_TEST_MODULE parser
+#include <boost/test/included/unit_test.hpp>
+
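+// illustrative example: count the paths reported for a matching and a
+// non-matching section of a small, hand-written configuration
+static void count_path(const char *module, void *param)
+{
+    (void)module;
+    ++*static_cast<int *>(param);
+}
+
+BOOST_AUTO_TEST_CASE(parse_simple_config)
+{
+    const char *input = "*SECTION\n"
+                        "ux0:/plugins/a.suprx\n"
+                        "ux0:/plugins/b.suprx\n";
+
+    // a well-formed document should validate
+    BOOST_REQUIRE_NE(taihen_config_validate(input), 0);
+
+    // both paths belong to the matching section and are streamed in order
+    int count = 0;
+    taihen_config_parse(input, "SECTION", count_path, &count);
+    BOOST_REQUIRE_EQUAL(count, 2);
+
+    // a non-matching section receives nothing (there is no ALL section here)
+    count = 0;
+    taihen_config_parse(input, "OTHER", count_path, &count);
+    BOOST_REQUIRE_EQUAL(count, 0);
+}
+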
+BOOST_AUTO_TEST_CASE(removed_for_now)
+{
+
+}
-- 
2.39.5