Algorithm Implementation/Trees/B+ tree
In computer science, a B+ tree is a type of tree data structure. It represents sorted data in a way that allows for efficient insertion and removal of elements. It is a dynamic, multilevel index with maximum and minimum bounds on the number of keys in each node.
A B+ tree is a variation on a B-tree. In a B+ tree, in contrast to a B tree, all data are saved in the leaves. Internal nodes contain only keys and tree pointers. All leaves are at the same lowest level. Leaf nodes are also linked together as a linked list to make range queries easy.
The maximum number of keys in a record is called the order of the B+ tree.
The minimum number of keys per record is 1/2 of the maximum number of keys. For example, if the order of a B+ tree is n, each node (except for the root) must have between n/2 and n keys.
The number of keys that may be indexed using a B+ tree is a function of the order of the tree and its height.
For a n-order B+ tree with a height of h:
- maximum number of keys is
- minimum number of keys is
The B+ tree was first described in the paper "Rudolf Bayer, Edward M. McCreight: Organization and Maintenance of Large Ordered Indices. Acta Informatica 1: 173-189 (1972)".
Sample implementation in C++
editCAREFUL: THE FOLLOWING CODE EXAMPLES DO NOT IMPLEMENT A PROPER B+-TREE. While they are very similar to a B+-Tree, they do not fulfill the B+-Tree criteria (as the authors admit in some comments).
This code snippet has been tested under Linux on a 32-bit x86 computer. Deletion of keys has not been implemented yet. It can be done quite easily in a lazy way with an amortized cost of O(log n), by rebuilding the tree from scratch every time that there are as many deleted keys as non-deleted keys. The rebuilding can be done in O(n) time, so its amortized cost is only O(1). This approach, however, would not be appropriate for real time systems.
The implementation uses the Boost library to have compile-time assertions and efficient memory allocation. The latter could be done with the new operator
instead, resulting in some performance penalty.
/* Copyright (C) 2006 David Garcia
* You may use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of this software subject to the following conditions:
* THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
* OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#if !defined BPLUSTREE_HPP_227824
#define BPLUSTREE_HPP_227824
// This is required for glibc to define std::posix_memalign
#if !defined(_XOPEN_SOURCE) || (_XOPEN_SOURCE < 600)
#define _XOPEN_SOURCE 600
#endif
#include <stdlib.h>
#include <assert.h>
#include <boost/static_assert.hpp>
#include <boost/pool/object_pool.hpp>
// DEBUG
#include <iostream>
using std::cout;
using std::endl;
#if defined DEBUG_ASSERT
# warning "DEBUG_ASSERT overloaded"
#else
# if defined DEBUG
# define DEBUG_ASSERT(expr) assert(expr)
# else
# define DEBUG_ASSERT(expr)
# endif
#endif
template <typename KEY, typename VALUE, unsigned N, unsigned M,
unsigned INNER_NODE_PADDING= 0, unsigned LEAF_NODE_PADDING= 0,
unsigned NODE_ALIGNMENT= 64>
class BPlusTree
{
public:
// N must be greater than two to make the split of
// two inner nodes sensible.
BOOST_STATIC_ASSERT(N>2);
// Leaf nodes must be able to hold at least one element
BOOST_STATIC_ASSERT(M>0);
// Builds a new empty tree.
BPlusTree()
: depth(0),
root(new_leaf_node())
{
// DEBUG
// cout << "sizeof(LeafNode)==" << sizeof(LeafNode) << endl;
// cout << "sizeof(InnerNode)==" << sizeof(InnerNode) << endl;
}
~BPlusTree() {
// Empty. Memory deallocation is done automatically
// when innerPool and leafPool are destroyed.
}
// Inserts a pair (key, value). If there is a previous pair with
// the same key, the old value is overwritten with the new one.
void insert(KEY key, VALUE value) {
InsertionResult result;
bool was_split;
if( depth == 0 ) {
// The root is a leaf node
DEBUG_ASSERT( *reinterpret_cast<NodeType*>(root) ==
NODE_LEAF);
was_split= leaf_insert(reinterpret_cast<LeafNode*>
(root), key, value, &result);
} else {
// The root is an inner node
DEBUG_ASSERT( *reinterpret_cast<NodeType*>
(root) == NODE_INNER );
was_split= inner_insert(reinterpret_cast<InnerNode*>
(root), depth, key, value, &result);
}
if( was_split ) {
// The old root was splitted in two parts.
// We have to create a new root pointing to them
depth++;
root= new_inner_node();
InnerNode* rootProxy=
reinterpret_cast<InnerNode*>(root);
rootProxy->num_keys= 1;
rootProxy->keys[0]= result.key;
rootProxy->children[0]= result.left;
rootProxy->children[1]= result.right;
}
}
// Looks for the given key. If it is not found, it returns false,
// if it is found, it returns true and copies the associated value
// unless the pointer is null.
bool find(const KEY& key, VALUE* value= 0) const {
InnerNode* inner;
register void* node= root;
register unsigned d= depth, index;
while( d-- != 0 ) {
inner= reinterpret_cast<InnerNode*>(node);
DEBUG_ASSERT( inner->type == NODE_INNER );
index= inner_position_for(key, inner->keys,
inner->num_keys);
node= inner->children[index];
}
LeafNode* leaf= reinterpret_cast<LeafNode*>(node);
DEBUG_ASSERT( leaf->type == NODE_LEAF );
index= leaf_position_for(key, leaf->keys, leaf->num_keys);
if( leaf->keys[index] == key ) {
if( value != 0 ) {
*value= leaf->values[index];
}
return true;
} else {
return false;
}
}
// Returns the size of an inner node
// It is useful when optimizing performance with cache alignment.
unsigned sizeof_inner_node() const {
return sizeof(InnerNode);
}
// Returns the size of a leaf node.
// It is useful when optimizing performance with cache alignment.
unsigned sizeof_leaf_node() const {
return sizeof(LeafNode);
}
private:
// Used when debugging
enum NodeType {NODE_INNER=0xDEADBEEF, NODE_LEAF=0xC0FFEE};
// Leaf nodes store pairs of keys and values.
struct LeafNode {
#ifdef DEBUG
LeafNode() : type(NODE_LEAF), num_keys(0) {}
const NodeType type;
#else
LeafNode() : num_keys(0) {}
#endif
unsigned num_keys;
KEY keys[M];
VALUE values[M];
unsigned char _pad[LEAF_NODE_PADDING];
};
// Inner nodes store pointers to other nodes interleaved with keys.
struct InnerNode {
#ifdef DEBUG
InnerNode() : type(NODE_INNER), num_keys(0) {}
const NodeType type;
#else
InnerNode() : num_keys(0) {}
#endif
unsigned num_keys;
KEY keys[N];
void* children[N+1];
unsigned char _pad[INNER_NODE_PADDING];
};
// Custom allocator that returns aligned blocks of memory
template <unsigned ALIGNMENT>
struct AlignedMemoryAllocator {
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
static char* malloc(const size_type bytes)
{
void* result;
if( posix_memalign(&result, ALIGNMENT, bytes) != 0 ) {
result= 0;
}
// Alternative: result= std::malloc(bytes);
return reinterpret_cast<char*>(result);
}
static void free(char* const block)
{ std::free(block); }
};
// Returns a pointer to a fresh leaf node.
LeafNode* new_leaf_node() {
LeafNode* result;
//result= new LeafNode();
result= leafPool.construct();
//cout << "New LeafNode at " << result << endl;
return result;
}
// Frees a leaf node previously allocated with new_leaf_node()
void delete_leaf_node(LeafNode* node) {
DEBUG_ASSERT( node->type == NODE_LEAF );
//cout << "Deleting LeafNode at " << node << endl;
// Alternatively: delete node;
leafPool.destroy(node);
}
// Returns a pointer to a fresh inner node.
InnerNode* new_inner_node() {
InnerNode* result;
// Alternatively: result= new InnerNode();
result= innerPool.construct();
//cout << "New InnerNode at " << result << endl;
return result;
}
// Frees an inner node previously allocated with new_inner_node()
void delete_inner_node(InnerNode* node) {
DEBUG_ASSERT( node->type == NODE_INNER );
//cout << "Deleting InnerNode at " << node << endl;
// Alternatively: delete node;
innerPool.destroy(node);
}
// Data type returned by the private insertion methods.
struct InsertionResult {
KEY key;
void* left;
void* right;
};
// Returns the position where 'key' should be inserted in a leaf node
// that has the given keys.
static unsigned leaf_position_for(const KEY& key, const KEY* keys,
unsigned num_keys) {
// Simple linear search. Faster for small values of N or M
unsigned k= 0;
while((k < num_keys) && (keys[k]<key)) {
++k;
}
return k;
/*
// Binary search. It is faster when N or M is > 100,
// but the cost of the division renders it useless
// for smaller values of N or M.
XXX--- needs to be re-checked because the linear search
has changed since the last update to the following ---XXX
int left= -1, right= num_keys, middle;
while( right -left > 1 ) {
middle= (left+right)/2;
if( keys[middle] < key) {
left= middle;
} else {
right= middle;
}
}
//assert( right == k );
return unsigned(right);
*/
}
// Returns the position where 'key' should be inserted in an inner node
// that has the given keys.
static unsigned inner_position_for(const KEY& key, const KEY* keys,
unsigned num_keys) {
// Simple linear search. Faster for small values of N or M
unsigned k= 0;
while((k < num_keys) && ((keys[k]<key) || (keys[k]==key))) {
++k;
}
return k;
// Binary search is faster when N or M is > 100,
// but the cost of the division renders it useless
// for smaller values of N or M.
}
bool leaf_insert(LeafNode* node, KEY& key,
VALUE& value, InsertionResult* result) {
DEBUG_ASSERT( node->type == NODE_LEAF );
assert( node->num_keys <= M );
bool was_split= false;
// Simple linear search
unsigned i= leaf_position_for(key, node->keys, node->num_keys);
if( node->num_keys == M ) {
// The node was full. We must split it
unsigned treshold= (M+1)/2;
LeafNode* new_sibling= new_leaf_node();
new_sibling->num_keys= node->num_keys -treshold;
for(unsigned j=0; j < new_sibling->num_keys; ++j) {
new_sibling->keys[j]= node->keys[treshold+j];
new_sibling->values[j]=
node->values[treshold+j];
}
node->num_keys= treshold;
if( i < treshold ) {
// Inserted element goes to left sibling
leaf_insert_nonfull(node, key, value, i);
} else {
// Inserted element goes to right sibling
leaf_insert_nonfull(new_sibling, key, value,
i-treshold);
}
// Notify the parent about the split
was_split= true;
result->key= new_sibling->keys[0];
result->left= node;
result->right= new_sibling;
} else {
// The node was not full
leaf_insert_nonfull(node, key, value, i);
}
return was_split;
}
static void leaf_insert_nonfull(LeafNode* node, KEY& key, VALUE& value,
unsigned index) {
DEBUG_ASSERT( node->type == NODE_LEAF );
assert( node->num_keys < M );
assert( index <= M );
assert( index <= node->num_keys );
if( (index < M) && (node->num_keys!=0) && (node->keys[index] == key) ) {
// We are inserting a duplicate value.
// Simply overwrite the old one
node->values[index]= value;
} else {
// The key we are inserting is unique
for(unsigned i=node->num_keys; i > index; --i) {
node->keys[i]= node->keys[i-1];
node->values[i]= node->values[i-1];
}
node->num_keys++;
node->keys[index]= key;
node->values[index]= value;
}
}
bool inner_insert(InnerNode* node, unsigned depth, KEY& key,
VALUE& value, InsertionResult* result) {
DEBUG_ASSERT( node->type == NODE_INNER );
assert( depth != 0 );
// Early split if node is full.
// This is not the canonical algorithm for B+ trees,
// but it is simpler and does not break the definition.
bool was_split= false;
if( node->num_keys == N ) {
// Split
unsigned treshold= (N+1)/2;
InnerNode* new_sibling= new_inner_node();
new_sibling->num_keys= node->num_keys -treshold;
for(unsigned i=0; i < new_sibling->num_keys; ++i) {
new_sibling->keys[i]= node->keys[treshold+i];
new_sibling->children[i]=
node->children[treshold+i];
}
new_sibling->children[new_sibling->num_keys]=
node->children[node->num_keys];
node->num_keys= treshold-1;
// Set up the return variable
was_split= true;
result->key= node->keys[treshold-1];
result->left= node;
result->right= new_sibling;
// Now insert in the appropriate sibling
if( key < result->key ) {
inner_insert_nonfull(node, depth, key, value);
} else {
inner_insert_nonfull(new_sibling, depth, key,
value);
}
} else {
// No split
inner_insert_nonfull(node, depth, key, value);
}
return was_split;
}
void inner_insert_nonfull(InnerNode* node, unsigned depth, KEY& key,
VALUE& value) {
DEBUG_ASSERT( node->type == NODE_INNER );
assert( node->num_keys < N );
assert( depth != 0 );
// Simple linear search
unsigned index= inner_position_for(key, node->keys,
node->num_keys);
InsertionResult result;
bool was_split;
if( depth-1 == 0 ) {
// The children are leaf nodes
for(unsigned kk=0; kk < node->num_keys+1; ++kk) {
DEBUG_ASSERT( *reinterpret_cast<NodeType*>
(node->children[kk]) == NODE_LEAF );
}
was_split= leaf_insert(reinterpret_cast<LeafNode*>
(node->children[index]), key, value, &result);
} else {
// The children are inner nodes
for(unsigned kk=0; kk < node->num_keys+1; ++kk) {
DEBUG_ASSERT( *reinterpret_cast<NodeType*>
(node->children[kk]) == NODE_INNER );
}
InnerNode* child= reinterpret_cast<InnerNode*>
(node->children[index]);
was_split= inner_insert( child, depth-1, key, value,
&result);
}
if( was_split ) {
if( index == node->num_keys ) {
// Insertion at the rightmost key
node->keys[index]= result.key;
node->children[index]= result.left;
node->children[index+1]= result.right;
node->num_keys++;
} else {
// Insertion not at the rightmost key
node->children[node->num_keys+1]=
node->children[node->num_keys];
for(unsigned i=node->num_keys; i!=index; --i) {
node->children[i]= node->children[i-1];
node->keys[i]= node->keys[i-1];
}
node->children[index]= result.left;
node->children[index+1]= result.right;
node->keys[index]= result.key;
node->num_keys++;
}
} // else the current node is not affected
}
typedef AlignedMemoryAllocator<NODE_ALIGNMENT> AlignedAllocator;
// Node memory allocators. IMPORTANT NOTE: they must be declared
// before the root to make sure that they are properly initialised
// before being used to allocate any node.
boost::object_pool<InnerNode, AlignedAllocator> innerPool;
boost::object_pool<LeafNode, AlignedAllocator> leafPool;
// Depth of the tree. A tree of depth 0 only has a leaf node.
unsigned depth;
// Pointer to the root node. It may be a leaf or an inner node, but
// it is never null.
void* root;
};
#endif // !defined BPLUSTREE_HPP_227824
Violate the B+Tree definition during inner node split
editAfter convert it into Java code, I realize the following code doing split insertInner() violate the B+ Tree definition For example, what if N = 4, so after you split, we end up with 3 nodes, root node with 1 key, original node with 1 key, and the sibling node with 3 keys. So it (original node) violates the B+ Tree rules at least half full in the inner node. Look like we still need the canonical algorithm to make it right. Not shortcut here. So this algorithm is not strict B+ Tree, but it works. Anyway, I convert above C++ code to Java code shown below.
It doesn't really work because generic arrays cannot be created by casting from a newly created array of a concrete ancestor like Object.
ie. T[] a = new T[10] won't work. see the field declarations of LNode.
Sample Implementation In Java
edit/**
* B+ Tree
* If you understand B+ or B Tree better, M & N don't need to be the same
* Here is an example of M=N=4, with 12 keys
*
* 5
* / \
* 3 7 9
* / \ / | \
* 1 2 3 4 5 6 7 8 9 10 11 12
* @author jwang01
* @version 1.0.0 created on May 19, 2006
* edited by Spoon! 2008
* edited by Mistro 2010
*/
public class BxTree<Key extends Comparable<? super Key>, Value>
{
/** Pointer to the root node. It may be a leaf or an inner node, but it is never null. */
private Node root;
/** the maximum number of keys in the leaf node, M must be > 0 */
private final int M;
/** the maximum number of keys in inner node, the number of pointer is N+1, N must be > 2 */
private final int N;
/** Create a new empty tree. */
public BxTree(int n) {
this(n, n);
}
public BxTree(int m, int n) {
M = m;
N = n;
root = new LNode();
}
public void insert(Key key, Value value) {
System.out.println("insert key=" + key);
Split result = root.insert(key, value);
if (result != null) {
// The old root was split into two parts.
// We have to create a new root pointing to them
INode _root = new INode();
_root.num = 1;
_root.keys[0] = result.key;
_root.children[0] = result.left;
_root.children[1] = result.right;
root = _root;
}
}
/**
* Looks for the given key. If it is not found, it returns null.
* If it is found, it returns the associated value.
*/
public Value find(Key key) {
Node node = root;
while (node instanceof BxTree.INode) { // need to traverse down to the leaf
INode inner = (INode) node;
int idx = inner.getLoc(key);
node = inner.children[idx];
}
//We are @ leaf after while loop
LNode leaf = (LNode) node;
int idx = leaf.getLoc(key);
if (idx < leaf.num && leaf.keys[idx].equals(key)) {
return leaf.values[idx];
} else {
return null;
}
}
public void dump() {
root.dump();
}
abstract class Node {
protected int num; //number of keys
protected Key[] keys;
abstract public int getLoc(Key key);
// returns null if no split, otherwise returns split info
abstract public Split insert(Key key, Value value);
abstract public void dump();
}
class LNode extends Node {
// In some sense, the following casts are almost always illegal
// (if Value was replaced with a real type other than Object,
// the cast would fail); but they make our code simpler
// by allowing us to pretend we have arrays of certain types.
// They work because type erasure will erase the type variables.
// It will break if we return it and other people try to use it.
final Value[] values = (Value[]) new Object[M];
{ keys = (Key[]) new Comparable[M]; }
/**
* Returns the position where 'key' should be inserted in a leaf node
* that has the given keys.
*/
public int getLoc(Key key) {
// Simple linear search. Faster for small values of N or M, binary search would be faster for larger M / N
for (int i = 0; i < num; i++) {
if (keys[i].compareTo(key) >= 0) {
return i;
}
}
return num;
}
public Split insert(Key key, Value value) {
// Simple linear search
int i = getLoc(key);
if (this.num == M) { // The node was full. We must split it
int mid = (M+1)/2;
int sNum = this.num - mid;
LNode sibling = new LNode();
sibling.num = sNum;
System.arraycopy(this.keys, mid, sibling.keys, 0, sNum);
System.arraycopy(this.values, mid, sibling.values, 0, sNum);
this.num = mid;
if (i < mid) {
// Inserted element goes to left sibling
this.insertNonfull(key, value, i);
} else {
// Inserted element goes to right sibling
sibling.insertNonfull(key, value, i-mid);
}
// Notify the parent about the split
Split result = new Split(sibling.keys[0], //make the right's key >= result.key
this,
sibling);
return result;
} else {
// The node was not full
this.insertNonfull(key, value, i);
return null;
}
}
private void insertNonfull(Key key, Value value, int idx) {
//if (idx < M && keys[idx].equals(key)) {
if (idx < num && keys[idx].equals(key)) {
// We are inserting a duplicate value, simply overwrite the old one
values[idx] = value;
} else {
// The key we are inserting is unique
System.arraycopy(keys, idx, keys, idx+1, num-idx);
System.arraycopy(values, idx, values, idx+1, num-idx);
keys[idx] = key;
values[idx] = value;
num++;
}
}
public void dump() {
System.out.println("lNode h==0");
for (int i = 0; i < num; i++){
System.out.println(keys[i]);
}
}
}
class INode extends Node {
final Node[] children = new BxTree.Node[N+1];
{ keys = (Key[]) new Comparable[N]; }
/**
* Returns the position where 'key' should be inserted in an inner node
* that has the given keys.
*/
public int getLoc(Key key) {
// Simple linear search. Faster for small values of N or M
for (int i = 0; i < num; i++) {
if (keys[i].compareTo(key) > 0) {
return i;
}
}
return num;
// Binary search is faster when N or M is big,
}
public Split insert(Key key, Value value) {
/* Early split if node is full.
* This is not the canonical algorithm for B+ trees,
* but it is simpler and it does break the definition
* which might result in immature split, which might not be desired in database
* because additional split lead to tree's height increase by 1, thus the number of disk read
* so first search to the leaf, and split from bottom up is the correct approach.
*/
if (this.num == N) { // Split
int mid = (N+1)/2;
int sNum = this.num - mid;
INode sibling = new INode();
sibling.num = sNum;
System.arraycopy(this.keys, mid, sibling.keys, 0, sNum);
System.arraycopy(this.children, mid, sibling.children, 0, sNum+1);
this.num = mid-1;//this is important, so the middle one elevate to next depth(height), inner node's key don't repeat itself
// Set up the return variable
Split result = new Split(this.keys[mid-1],
this,
sibling);
// Now insert in the appropriate sibling
if (key.compareTo(result.key) < 0) {
this.insertNonfull(key, value);
} else {
sibling.insertNonfull(key, value);
}
return result;
} else {// No split
this.insertNonfull(key, value);
return null;
}
}
private void insertNonfull(Key key, Value value) {
// Simple linear search
int idx = getLoc(key);
Split result = children[idx].insert(key, value);
if (result != null) {
if (idx == num) {
// Insertion at the rightmost key
keys[idx] = result.key;
children[idx] = result.left;
children[idx+1] = result.right;
num++;
} else {
// Insertion not at the rightmost key
//shift i>idx to the right
System.arraycopy(keys, idx, keys, idx+1, num-idx);
System.arraycopy(children, idx, children, idx+1, num-idx+1);
children[idx] = result.left;
children[idx+1] = result.right;
keys[idx] = result.key;
num++;
}
} // else the current node is not affected
}
/**
* This one only dump integer key
*/
public void dump() {
System.out.println("iNode h==?");
for (int i = 0; i < num; i++){
children[i].dump();
System.out.print('>');
System.out.println(keys[i]);
}
children[num].dump();
}
}
class Split {
public final Key key;
public final Node left;
public final Node right;
public Split(Key k, Node l, Node r) {
key = k; left = l; right = r;
}
}
}