We all know that operating on strings can be prohibitively expensive. In my free time, I am working on my own script language, and there are several places where strings are used: during the compilation of scripts, building the AST, code generation, and assembly generation, to name a few. To help speed up compilation, I knocked together this small class, which basically replaces string comparisons with integer comparisons. The class itself is nothing more than a glorified hash-table, that allows you to insert, find, and retrieve strings from this table.
#include <vector>
#include <assert.h>
typedef size_t IndexedString;
const IndexedString InvalidString = 0;
//
// The string table class. Keeps a list of shared static strings, which
// can be safely compared by using the IndexedString type.
// Invalid/empty string are zero
//
class StringTable
{
public:
StringTable()
: hash(0)
, hashCount(8)
{
Rehash();
// Reserve first slot
Insert("__reserved__");
}
~StringTable()
{
if( hash != 0 )
delete[] hash;
for(std::vector<Entry>::const_iterator it = strings.begin(); it != strings.end(); ++it)
delete[] it->string;
}
IndexedString Insert(const char *string)
{
if(string == 0 || strlen(string) == 0)
return InvalidString;
// See if it is already in the table
size_t hashVal = HashFunction(string) & (hashCount-1);
for( size_t i = hash[hashVal]; i != (size_t)-1; i = strings[i].hashNext )
{
if( strcmp(strings[i].string, string) == 0 )
return i;
}
// Add a new entry
Entry entry;
size_t index = strings.size();
entry.hashNext = hash[hashVal];
entry.string = _strdup(string);
hash[hashVal] = index;
strings.push_back(entry);
// Update hashtable if needed
if( hashCount*2+8 < (index+1) )
{
hashCount *= 2;
strings.reserve(hashCount);
Rehash();
}
assert( index == Find(string) );
return index;
}
IndexedString Find(const char *string)
{
if(string == 0 || strlen(string) == 0)
return InvalidString;
// See if it is already in the table
size_t hashVal = HashFunction(string) & (hashCount-1);
for( size_t i = hash[hashVal]; i != (size_t)-1; i = strings[i].hashNext )
{
if( strcmp(strings[i].string, string) == 0 )
return i;
}
return InvalidString;
}
// Resets the table
void Empty()
{
if( hash != 0 )
{
delete[] hash;
hash = 0;
hashCount = 8;
}
for(std::vector<Entry>::const_iterator it = strings.begin(); it != strings.end(); ++it)
delete[] it->string;
strings.clear();
}
std::string Get(IndexedString index) const
{
if(index >= strings.size() || index == InvalidString)
return std::string();
else
return strings[index].string;
}
protected:
void Rehash()
{
assert(!(hashCount&(hashCount-1)));
assert(hashCount>=8);
size_t* newHash = new size_t[hashCount+1];
memset(newHash, -1, sizeof(size_t)*hashCount);
{
for( size_t i=0; i<strings.size(); i++ )
{
Entry& entry = strings[i];
size_t hashVal = HashFunction(entry.string) & (hashCount-1);
entry.hashNext = newHash[hashVal];
newHash[hashVal] = i;
}
}
if( hash )
delete[] hash;
hash = newHash;
}
void Relax()
{
while( hashCount > (strings.size()*2+8) )
hashCount >>= 1;
Rehash();
}
size_t HashFunction(const char *s)
{
if(s == 0)
return 0;
size_t HashVal = 5381;
while(*s != 0)
{
HashVal = ((HashVal << 5) + HashVal) ^ *s;
s++;
}
return HashVal;
}
protected:
struct Entry
{
size_t hashNext;
const char *string;
};
size_t *hash;
size_t hashCount;
std::vector<Entry> strings;
};
Using the above class, you can now do string-comparisons as such:
int main(int argc, char* argv[])
{
StringTable st;
char *sz1 = "test string nr. 1";
char *sz2 = "test string nr. 2";
char *sz3 = "test string nr. 3";
char *sz4 = "test string nr. 4";
// Same strings, different addresses
char *sz1_dup = _strdup(sz1);
char *sz2_dup = _strdup(sz2);
char *sz3_dup = _strdup(sz3);
char *sz4_dup = _strdup(sz4);
// Insert strings into the table
IndexedString index1 = st.Insert(sz1);
IndexedString index2 = st.Insert(sz2);
IndexedString index3 = st.Insert(sz3);
IndexedString index4 = st.Insert(sz4);
// Validate correct comparisons
if(st.Find(sz1_dup) == index1) std::cout << "Nr. 1 is a match" << std::endl;
if(st.Find(sz2_dup) == index2) std::cout << "Nr. 2 is a match" << std::endl;
if(st.Find(sz3_dup) == index3) std::cout << "Nr. 3 is a match" << std::endl;
if(st.Find(sz4_dup) == index4) std::cout << "Nr. 4 is a match" << std::endl;
// Retrieve strings from table
std::cout << "Index 1 = " << st.Get(index1).c_str() << std::endl;
std::cout << "Index 2 = " << st.Get(index2).c_str() << std::endl;
std::cout << "Index 3 = " << st.Get(index3).c_str() << std::endl;
std::cout << "Index 4 = " << st.Get(index4).c_str() << std::endl;
std::cout << "Invalid index 1 = " << st.Get(0).c_str() << std::endl;
std::cout << "Invalid index 2 = " << st.Get(0x80000000).c_str() << std::endl;
// Free allocated data
delete [] sz1_dup;
delete [] sz2_dup;
delete [] sz3_dup;
delete [] sz4_dup;
return 0;
}












