Thursday, March 7, 2013

Psuedo strtok in SPL


I needed a way to extract the individual words from a sentence stored in a single character field. After some failed google searches and no desire to install a Datablade or write a C UDR for something that doesn't need to have killer performance, I decided to write my own quick and dirty SPL function.

my_strtok(str, delim, token_num) will take a string, break it into individual tokens based the delimiter and return the Nth token of the string.

Running this:

execute function my_strtok("How now brown cow", " ", 3)

Would return the third token:

brown

Here is the code for my_strtok(), comments welcome on anything I might have missed in the logic. And when I say it is slow, I just mean it could be done in a different way and perform more efficiently, but for what I needed it works.

create function my_strtok (str lvarchar(2048), delim char(1), token_num smallint)
returning lvarchar(2048) as token;

        define str_len integer;
        define start_pos integer;
        define stop_pos integer;
        define cur_token_num integer;


        -- initialize start position and current token number to 1
        let start_pos = 1;
        let cur_token_num = 1;

        -- remove any leading delimiters from the input string
        let str = ltrim(str, delim);

        -- save the input string length so we don't have to recalculate it later
        let str_len = length(str);


        -- find the start of the token we want to return

        -- while there is still more string available to process
        while (start_pos <= str_len)
                -- if the current token number is the token we want, stop looking
                -- for a start position
                if (cur_token_num = token_num) then
                        exit;
                end if;

                -- increment the start position to the next character
                let start_pos = start_pos + 1;

                -- check to see if the current character in the string is a delimiter
                if (substr(str, start_pos, 1) = delim) then
                        -- we have found the next token
                        let cur_token_num = cur_token_num + 1;

                        -- advance the token start position past any repeating delimiters
                        while (start_pos <= str_len)
                                let start_pos = start_pos + 1;

                                if (substr(str, start_pos, 1) != delim) then
                                        -- there are no more repeating delimiters
                                        -- stop looking for repeating delimiters
                                        exit;
                                end if;
                        end while;
                end if;
        end while;


        -- we now either have the start position of the token we are looking for
        -- or we did not find the token we were looking for
        -- if we did not find the token, return NULL
        -- if we did find the token we were looking for, find the end of the token

        if (cur_token_num = token_num) then
                -- we found the token
                let stop_pos = start_pos;

                -- while there is still string to process try to find the end of our token
                -- if we run out of string before we find the next delimiter then
                -- our token ends where the string ends
                while (stop_pos <= str_len)
                        let stop_pos = stop_pos + 1;

                        if (substr(str, stop_pos, 1) = delim) then
                                -- we found the end
                                let stop_pos = stop_pos - 1;
                                exit;
                        end if;
                end while;

                -- return the found token
                return substr(str, start_pos, stop_pos - start_pos + 1);
        else
                -- the token was not found
                return NULL;
        end if;
end function;

execute function my_strtok("Simple test", " ", 1);

token  Simple

1 row(s) retrieved.


execute function my_strtok("Simple test", " ", 2);

token  test

1 row(s) retrieved.


execute function my_strtok("    Leading delimiters", " ", 1);

token  Leading

1 row(s) retrieved.


execute function my_strtok("Repeating       delimiters", " ", 2);

token  delimiters

1 row(s) retrieved.


execute function my_strtok("Token not found", " ", 4);

token

1 row(s) retrieved.


execute function my_strtok("Should have checked for invalid input", " ", -1);

token

1 row(s) retrieved.


execute function my_strtok("Invalid input works, but is unecessarily slow", " ", -1000);

token

1 row(s) retrieved.


execute function my_strtok("Empty delimiter defaults to space, convenient", "", 6);

token  convenient

1 row(s) retrieved.

2 comments:

  1. This comment has been removed by a blog administrator.

    ReplyDelete
  2. i'm quite new to informix so i would like to know how could it have been more efficient. i only know basic sql statements and i'm eager to learn. thanks!

    ReplyDelete