An inverted index is an index data structure storing a mapping from content, such as words or numbers, to its locations in a database file, or in a document or a set of documents, in this case allowing full text search [more on wkipedia].
Lua extension
The inverted index for Tokyo Tyrant (and LightCloud) is hacked by Mikio Hirabayashi - the developer of Tokyo Tyrant:
--
-- Inverted index by the Lua extension of Tokyo Tyrant
--
-- constants
DELIMS = " \t\r\n" -- delimiters of tokenizing
LIMNUM = 2000 -- limit number of kept occurrence
DEFMAX = 10 -- default maximum number of search
-- call back function when starting
function _begin()
_log("Inverted index started")
end
-- call back function when ending
function _end()
_log("Inverted index finished")
end
-- register a text into the index
function put(id, text)
id = tonumber(id)
if not id or id < 1 then
return nil
end
if not text then
return nil
end
local tokens = _tokenize(text)
if math.random() < 5 / LIMNUM then
for i = 1, #tokens do
token = tokens[i]
if not _lock(token) then
_log("lock error")
return nil
end
local ids = {}
local idsel = _get(token)
if idsel then
ids = _unpack("w*", idsel)
end
local nids = {}
local top = #ids - LIMNUM + 2
if top < 1 then
top = 1
end
for j = top, #ids do
table.insert(nids, ids[j])
end
table.insert(nids, id)
idsel = _pack("w*", nids)
if not _put(token, idsel) then
_log("put error")
_unlock(token)
return nil
end
_unlock(token)
end
else
local idsel = _pack("w", id)
for i = 1, #tokens do
token = tokens[i]
if not _lock(token) then
_log("lock error")
return nil
end
if not _putcat(token, idsel) then
_log("putcat error")
_unlock(token)
return nil
end
_unlock(token)
end
end
return "ok"
end
-- remove a text from the index
function out(id, text)
id = tonumber(id)
if not id or id < 1 then
return nil
end
if not text then
return nil
end
local tokens = _tokenize(text)
for i = 1, #tokens do
token = tokens[i]
if not _lock(token) then
_log("lock error")
return nil
end
local ids = {}
local idsel = _get(token)
if idsel then
ids = _unpack("w*", idsel)
end
local nids = {}
for j = 0, #ids do
if ids[j] ~= id then
table.insert(nids, ids[j])
end
end
idsel = _pack("w*", nids)
if not _put(token, idsel) then
_log("put error")
_unlock(token)
return nil
end
_unlock(token)
end
return "ok"
end
-- replace the text
function replace(id, befaft)
id = tonumber(id)
if not id or id < 1 then
return nil
end
if not befaft then
return nil
end
local pivot = string.find(befaft, "\n", 1, true)
if not pivot then
return nil
end
local bef = string.sub(befaft, 1, pivot - 1)
local aft = string.sub(befaft, pivot + 1)
if not out(id, bef) then
return nil
end
if not put(id, aft) then
return nil
end
return "ok"
end
-- search the index with a phrase of intersection
function search(phrase, max)
if not phrase then
return nil
end
max = tonumber(max)
if not max or max < 0 then
max = DEFMAX
end
local tokens = _tokenize(phrase)
local tnum = #tokens
if tnum < 1 then
return "0\n"
end
local idsel = _get(tokens[1])
local result = _unpack("w*", idsel)
for i = 2, tnum do
idsel = _get(tokens[i])
local ids = _unpack("w*", idsel)
result = _isect(result, ids)
end
-- if tnum > 1 then
-- local rset = {}
-- table.insert(rset, result)
-- for i = 2, tnum do
-- idsel = _get(tokens[i])
-- result = _unpack("w*", idsel)
-- table.insert(rset, result)
-- end
-- result = _isect(rset)
-- end
table.sort(result)
local rtxt = #result .. "\n"
local bot = #result - max
if bot < 1 then
bot = 1
end
for i = #result, bot, -1 do
if max < 1 then
break
end
rtxt = rtxt .. result[i] .. "\n"
max = max - 1
end
return rtxt
end
-- break a text into an array of tokens
function _tokenize(text)
local tokens = {}
local uniq = {}
for token in string.gmatch(text, "[^" .. DELIMS .. "]+") do
if #token > 0 and not uniq[token] then
table.insert(tokens, token)
uniq[token] = true
end
end
return tokens
end
-- END OF FILE
