{ This example unit shows how to implement a custom FTS2 tokenizer
  with DISQLite3.

  This tokenizer uses proper Pascal delimiters (the '_' underscore is NOT a
  delimiter), decodes UTF-8 input, and outputs UTF-8 tokens.

  Visit the DISQLite3 Internet site for latest information and updates:

    http://www.yunqa.de/delphi/

  Copyright (c) 2005-2009 Ralf Junker, The Delphi Inspiration <delphi@yunqa.de>

------------------------------------------------------------------------------ }

unit DISQLite3PascalTokenizer;

{$I DI.inc}
{$I DISQLite3.inc}

interface

uses
  DISystemCompat, DISQLite3Api;

{ Returns a pointer to the Pascal Tokenizer Module. }

function sqlite3Fts2PascalTokenizerModule: Psqlite3_tokenizer_module;

implementation

{$IFDEF DIH_HAVE_DILibC}
uses
  DILibC;
{$ENDIF DIH_HAVE_DILibC}

type

  { Structures used by the tokenizer interface. When a new tokenizer
    implementation is registered, the caller provides a pointer to
    an sqlite3_tokenizer_module containing pointers to the callback
    functions that make up an implementation.

    When an fts2 table is created, it passes any arguments passed to
    the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
    sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
    implementation. The xCreate() function in turn returns an
    sqlite3_tokenizer structure representing the specific tokenizer to
    be used for the fts2 table (customized by the tokenizer clause arguments).

    To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
    method is called. It returns an sqlite3_tokenizer_cursor object
    that may be used to tokenize a specific input buffer based on
    the tokenization rules supplied by a specific sqlite3_tokenizer
    object. }

  { Tokenizer instance created by pascal_tokenizer_Create and released by
    pascal_tokenizer_destroy. It holds no state besides the generic base
    record. Base is the first field because the record pointer is handed to
    the caller as a Psqlite3_tokenizer and later freed through that same
    pointer. }
  Tpascal_tokenizer = packed record
    Base: Tsqlite3_tokenizer; // Generic tokenizer header.
  end;
  Ppascal_tokenizer = ^Tpascal_tokenizer;

  { Cursor state for tokenizing one input buffer. Allocated by
    pascal_tokenizer_Open, advanced by pascal_tokenizer_Next and released by
    pascal_tokenizer_Close. Base is the first field because the record
    pointer is cast to and from Psqlite3_tokenizer_cursor. }
  Tpascal_tokenizer_cursor = packed record
    Base: Tsqlite3_tokenizer_cursor; // Generic cursor header.
    pInput: PUtf8Char; // input where we are tokenizing (start of the buffer).
    pPos: PUtf8Char; // current read position inside pInput.
    nBytes: Cardinal; // input bytes remaining at pPos.
    iOffset: Integer; // current position in pInput. NOTE(review): not read by any routine in this unit.
    iToken: Integer; // index of next token to be returned.
    pToken: PUtf8Char; // storage for current token; nil until the first token is produced.
    nTokenAllocated: Integer; // bytes allocated to the pToken buffer.
  end;
  Ppascal_tokenizer_cursor = ^Tpascal_tokenizer_cursor;

  //------------------------------------------------------------------------------

{ xCreate callback: allocates a new Pascal tokenizer instance.

  argc / argv carry the arguments that followed the tokenizer name in the
  "tokenizer" clause of the CREATE VIRTUAL TABLE statement, e.g.

    CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2);

  yields argc = 2 with argv[] pointing at "arg1" and "arg2". This tokenizer
  takes no options, so both parameters are ignored.

  On success the function returns SQLITE_OK and stores the new tokenizer in
  ppTokenizer^. The generic sqlite3_tokenizer.pModule field is deliberately
  left untouched here; the caller fills it in. }

function pascal_tokenizer_Create(
  argc: Integer; // Number of entries in argv[].
  const argv: PPUtf8CharArray; // Tokenizer creation arguments.
  ppTokenizer: PPsqlite3_tokenizer // OUT: Created tokenizer.
  ): Integer;
var
  NewTokenizer: Ppascal_tokenizer;
begin
  Result := SQLITE_OK;
  GetMem(NewTokenizer, SizeOf(Tpascal_tokenizer));
  // Hand the record back through its generic base pointer type.
  ppTokenizer^ := Pointer(NewTokenizer);
end;

//------------------------------------------------------------------------------

{ xDestroy callback: releases a tokenizer obtained from
  pascal_tokenizer_Create. The fts2 module invokes it exactly once for every
  successful xCreate() call. Always returns SQLITE_OK. }

function pascal_tokenizer_destroy(
  pTokenizer: Psqlite3_tokenizer): Integer;
begin
  Result := SQLITE_OK;
  // The tokenizer owns no further resources, so freeing the record suffices.
  FreeMem(pTokenizer);
end;

//------------------------------------------------------------------------------

{ Create a tokenizer cursor to tokenize an input buffer. The caller
  is responsible for ensuring that the input buffer remains valid
  until the cursor is closed (using the xClose() method).

  pInput may be nil, in which case the cursor is opened on empty input.
  A negative nBytes means that pInput is zero-terminated and its length is
  determined with StrLen.

  Returns SQLITE_OK and stores the new cursor in ppCursor^. }

function pascal_tokenizer_Open(
  pTokenizer: Psqlite3_tokenizer; // The tokenizer
  const pInput: PUtf8Char; // Input string
  nBytes: Integer; // Length of pInput in bytes
  ppCursor: PPsqlite3_tokenizer_cursor // OUT: Tokenization cursor
  ): Integer;
var
  c: Ppascal_tokenizer_cursor;
begin
  GetMem(c, SizeOf(c^));
  { GetMem does not clear the block. Zero it so that every field -- including
    Base and iOffset, which are not assigned individually below -- starts
    from a defined state: no bytes to read, token index 0, and no token
    buffer allocated yet. }
  FillChar(c^, SizeOf(c^), 0);

  c^.pInput := pInput;
  c^.pPos := pInput;
  if Assigned(pInput) then
    begin
      if nBytes < 0 then
        c^.nBytes := {$IFDEF DIH_HAVE_DILibC}DILibC{$ELSE}DISQLite3Api{$ENDIF}.StrLen(pInput)
      else
        c^.nBytes := nBytes;
    end;

  ppCursor^ := Pointer(c);
  Result := SQLITE_OK;
end;

//------------------------------------------------------------------------------

{ xClose callback: releases a cursor obtained from pascal_tokenizer_Open
  together with its token buffer. The fts2 module invokes it exactly once
  for every successful xOpen() call. Always returns SQLITE_OK. }

function pascal_tokenizer_Close(
  pCursor: Psqlite3_tokenizer_cursor): Integer;
var
  Cursor: Ppascal_tokenizer_cursor;
begin
  Result := SQLITE_OK;
  Cursor := Ppascal_tokenizer_cursor(pCursor);
  // Release the token buffer first, then the cursor record that owns it.
  FreeMem(Cursor^.pToken);
  FreeMem(Cursor);
end;

//------------------------------------------------------------------------------

{ Helper Function: Tells whether the Unicode Code Point c separates Pascal
  tokens. Only the Latin 1 (ISO-8859-1) range is classified (for size); every
  code point above $FF is treated as a token character. The underscore ($5F)
  is a token character, not a delimiter. }

function pascal_Delim(const c: Cardinal): Boolean;
begin
  Result := False;
  case c of
    $00..$2F, // Control characters, space, '!' .. '/'.
    $3A..$3F, // ':' .. '?'. NOTE(review): '@' ($40) is not listed -- confirm this is intended.
    $5B..$5E, // '[' .. '^'.
    $60, // Grave accent.
    $7B..$BF, // '{' .. '~', DEL and code points $80 .. $BF.
    $D7, // Multiplication sign.
    $F7: // Division sign.
      Result := True;
  end;
end;

//------------------------------------------------------------------------------

{ Helper Function: Return the lower case equivalent of a given Unicode Code
  Point. Handles Latin 1 (ISO-8859-1) characters only (for size); all other
  code points are returned unchanged.

  In Latin 1 every upper case letter maps to its lower case form by adding
  $20. The multiplication sign ($D7) sits inside the upper case block and is
  not a letter, so it is excluded. Sharp s ($DF) is already lower case.

  NOTE: Capital Eth ($D0) is folded to $F0 like the other letters in its
  block. Full text indexes that were built while Eth was left unfolded need
  to be rebuilt for such tokens to match. }

function pascal_Lower(const c: Cardinal): Cardinal;
begin
  Result := c;
  case Result of
    $41..$5A, // 'A' .. 'Z'.
    $C0..$D6, // A with grave .. O with diaeresis, including Eth ($D0).
    $D8..$DE: // O with stroke .. Thorn.
      Inc(Result, $20);
  end;
end;

//------------------------------------------------------------------------------

{ Retrieve the next token from the tokenizer cursor pCursor. This
  method should either return SQLITE_OK and set the values of the
  "OUT" variables identified below, or SQLITE_DONE to indicate that
  the end of the buffer has been reached, or an SQLite error code.

  ppToken should be set to point at a buffer containing the
  normalized version of the token (i.e. after any case-folding and/or
  stemming has been performed). pnBytes should be set to the length
  of this buffer in bytes. The input text that generated the token is
  identified by the byte offsets returned in piStartOffset and
  piEndOffset.

  The buffer ppToken is set to point at is managed by the tokenizer
  implementation. It is only required to be valid until the next call
  to xNext() or xClose().

  This implementation skips delimiters (see pascal_Delim), then copies the
  following run of non-delimiter characters, lower-cased with pascal_Lower
  and re-encoded as UTF-8, into the cursor's token buffer. The buffer is
  kept between calls and grown in steps of $20 bytes as needed. When
  SQLITE_DONE is returned, none of the OUT parameters are written. }

function pascal_tokenizer_Next(
  pCursor: Psqlite3_tokenizer_cursor; // Cursor returned by pascal_tokenizer_Open.
  const ppToken: PPUtf8Char; // OUT: ppToken is the token text.
  pnBytes: PInteger; // OUT: Number of bytes in token.
  piStartOffset: PInteger; // OUT: Starting offset of token.
  piEndOffset: PInteger; // OUT: Ending offset of token.
  piPosition: PInteger): Integer; // OUT: Position integer of token.
var
  c: Ppascal_tokenizer_cursor;
  Ch: Cardinal;
  pStart, TokenPtr: PUtf8Char;
  l, TokenLen, TokenUsed: Integer;
begin
  c := Pointer(pCursor);

  { Find the first non-delimiter character. The loop is left only in one of
    two ways: the input is exhausted (no further token), or Ch holds the
    first character of a token and l its encoded length in the input. }
  repeat
    if c^.nBytes = 0 then
      begin
        Result := SQLITE_DONE;
        Exit;
      end;
    l := sqlite3_read_utf8(c^.pPos, c^.nBytes, Ch);
    if not pascal_Delim(Ch) then
      Break;
    Inc(c^.pPos, l); Dec(c^.nBytes, l);
  until False;

  { At this point, Ch holds a non-delimiter token character. Write it to the
    token buffer and continue reading the token up to the next delimiter or
    until all input is consumed. }
  pStart := c^.pPos;
  TokenPtr := c^.pToken; // Next write position in the token buffer.
  TokenLen := c^.nTokenAllocated; // Free bytes remaining at TokenPtr.

  repeat
    // Consume the character that was just read into Ch.
    Inc(c^.pPos, l); Dec(c^.nBytes, l);

    { Make sure the token buffer is large enough to hold a token character.
      UTF-8 writes at most 6 bytes. }
    if TokenLen < 6 then
      begin
        { ReallocMem may move the buffer, so remember how many bytes are
          already in use and re-derive the write pointer from that count.
          (The free-space count must not be used as the offset: the two
          only coincide while the buffer is still empty.) }
        TokenUsed := c^.nTokenAllocated - TokenLen;
        Inc(c^.nTokenAllocated, $20);
        ReallocMem(c^.pToken, c^.nTokenAllocated);
        TokenPtr := c^.pToken + TokenUsed;
        TokenLen := c^.nTokenAllocated - TokenUsed;
      end;

    Ch := pascal_Lower(Ch);
    l := sqlite3_write_utf8(Ch, TokenPtr);
    Inc(TokenPtr, l); Dec(TokenLen, l);

    if c^.nBytes = 0 then
      Break;
    { Peek at the next character. It is consumed at the top of the loop only
      if it belongs to the token; a delimiter stays in the input and is
      skipped by the next call. }
    l := sqlite3_read_utf8(c^.pPos, c^.nBytes, Ch);
  until pascal_Delim(Ch);

  ppToken^ := c^.pToken;
  pnBytes^ := TokenPtr - c^.pToken;

  // Byte offsets of the token's source text, relative to the input start.
  piStartOffset^ := pStart - c^.pInput;
  piEndOffset^ := c^.pPos - c^.pInput;

  piPosition^ := c^.iToken;
  Inc(c^.iToken);
  Result := SQLITE_OK;
end;

//------------------------------------------------------------------------------

const
  { Method table of the Pascal tokenizer. It wires the callbacks implemented
    in this unit into the generic tokenizer module record; its address is
    what sqlite3Fts2PascalTokenizerModule returns. }
  pascal_tokenizer: Tsqlite3_tokenizer_module = (
    iVersion: 0;
    xCreate: pascal_tokenizer_Create;
    xDestroy: pascal_tokenizer_destroy;
    xOpen: pascal_tokenizer_Open;
    xClose: pascal_tokenizer_Close;
    xNext: pascal_tokenizer_Next);

function sqlite3Fts2PascalTokenizerModule: Psqlite3_tokenizer_module;
begin
  // Every call returns the address of the same unit-level method table.
  Result := Addr(pascal_tokenizer);
end;

end.

