跳表(SkipList)的c++实现和性能测试

程勇uestc

已于 2022-02-28 17:13:58 修改

阅读量2k

点赞数 2

分类专栏：算法系列专题文章标签： skipList 跳表跳表性能测试跳表c++实现跳表概率因子

于 2022-02-28 16:25:57 首次发布

本文链接：https://blog.csdn.net/qq_40438165/article/details/123184655

版权

算法系列专题专栏收录该内容

11 篇文章

订阅专栏

$\quad$ 跳表的理论参考这个PPT，本文主要介绍跳表的实现，以及跳表的性能随着概率因子变化而变化的情况，从而发现最优概率因子的范围。

一、跳表的实现

$\quad$ 跳表节点需要两个指针，一个指向右侧节点，一个指向下方节点，节点定义如下：

struct Node{
    int val;
    Node *next;  // 右边节点
    Node *down;  // 下方节点
    Node(int x){
        val = x;
        next = down = NULL;
    }
};

$\quad$ 定义跳表类SkipList，跳表需要一个头节点，表示最高层最左侧的节点，该节点初始化为最小值。跳表还需要指定一个超参数——概率因子prob，表明以prob的概率向上生成新的一层，相同节点数目下，prob越大跳表深度越深。

class SkipList{
public:
    Node *head;
    double prob;  // 以prob的概率向上生成新的一层，prob越大跳表深度越深
    SkipList(double _prob=0.5){
        prob = _prob;
        head = new Node(INT32_MIN);  // 初始化头节点为最小值
    }
}

$\quad$ 接下来完善跳表类中的查询、删除和插入代码，如下：

bool search(int x){  // 查看跳表中是否存在值为x的节点
    Node *p = head;
    while (p){
        if(p->val == x) return true;
        else if(p->next == NULL) p = p->down;  // 右侧节点为空，只能往下走
        else if(p->next->val > x) p = p->down;  // 右侧节点值大于x，往下走
        else p = p->next;  // 右侧节点值小于x，往右走
    }
    return false;  // p为空还没找到，则不存在
}

void erase(int x){  // 删除跳表中值为x的节点
    Node *p = head;
    while (p){
        if(p->next == NULL) p = p->down;
        else if(p->next->val > x) p = p->down;
        else if(p->next->val == x){
            p->next = p->next->next;  // 删除右侧值为x的节点
            p = p->down;
        }else p = p->next;
    }
}

void insert(int x){  // 往跳表中插入节点
    if(search(x)) return;  // 避免重复插入
    stack<Node *> st;  // 记录向下走过的节点，这一列节点后面可以插入x
    Node *p = head;
    while (p){
        if(p->next == NULL || p->next->val > x){
            st.push(p);  // 右侧没有节点或者右侧节点值更大， 说明新节点应该加在当前节点与右侧节点之间
            p = p->down;
        }else p = p->next;
    }
    Node *down = NULL;  // 保存最后一个插入的节点，如果最后需要额外新增一层时需要用到
    bool f = true;  // 用于保证最下方一层一定能插入新节点
    while (!st.empty()){
        Node *pre = st.top();  // 当前节点右侧用于插入新节点
        st.pop();
        double num = 1.0 * rand() / RAND_MAX;  // 取一个[0, 1)之间的随机数作为概率

        if(f || num < prob){  // 如果是最下面一层或者概率超过prob，则插入节点
            f = false;
            Node *t = new Node(x);
            if(pre->next == NULL) pre->next = t;
            else{
                t->next = pre->next;  // 插入新节点
                pre->next = t;
            }
            t->down = down;
            down = t;
        }else return;
    }
    // 通过随机数判断是否再上升一层
    double num = 1.0 * rand() / RAND_MAX;
    if(num < prob){  // 上升一层需要更新新的head节点
        Node *t = new Node(x);
        Node *minNode = new Node(INT32_MIN);
        minNode->down = head;
        minNode->next = t;
        t->down = down;
        head = minNode;
    }
}

$\quad$ 接着可以给出一些辅助方法，即打印出该跳表和统计该跳表的最大深度、节点数和平均深度等信息，用于后续性能测试：

void show(){
    Node *p = head;
    while (p){
        Node *node = p->next;
        cout << "head";
        while (node){
            cout << "-->" << node->val;
            node = node->next;
        }
        cout << "-->NULL" << endl;
        p = p->down;
    }
}

void info(){  // 统计跳表最大深度，节点个数和节点平均深度
    int maxDepth = 0;  // 最大深度
    unordered_map<Node*, int> level; // 所有节点的深度
    Node *p = head;
    while (p) {
        maxDepth ++;
        Node *node = p->next;
        while (node){
            level[node] = maxDepth;
            node = node->next;
        }
        p = p->down;
    }
    cout << "max depth=" << maxDepth << endl;
    int NodeNum = level.size();  // 节点数目
    int s = 0;
    for(auto it: level) s += it.second;
    double avgLevel = s * 1.0 / NodeNum;  // 平均深度
    unordered_set<int> H;
    for(auto it: level) H.insert(it.first->val);
    double repeatRatio = 1.0 * NodeNum / H.size();
    cout << "node num=" << NodeNum << ", avg level=" << avgLevel << endl;
    cout << "different val num=" << H.size() << ", repeat ratio=" << repeatRatio << endl;
}

二、性能测试

$\quad$ 我们随机产生100万个不同的整数插入进跳表，再依次查询这100万个数，统计在不同概率因子下程序耗时，完整地代码如下：

#include <iostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include <ctime>
#include <vector>
using namespace std;

struct Node{
    int val;
    Node *next;  // 右边节点
    Node *down;  // 下方节点
    Node(int x){
        val = x;
        next = down = NULL;
    }
};

class SkipList{
public:
    Node *head;
    double prob;  // 以prob的概率向上生成新的一层，prob越大跳表深度越深
    SkipList(double _prob=0.5){
        prob = _prob;
        head = new Node(INT32_MIN);  // 初始化头节点为最小值
    }

    bool search(int x){  // 查看跳表中是否存在值为x的节点
        Node *p = head;
        while (p){
            if(p->val == x) return true;
            else if(p->next == NULL) p = p->down;  // 右侧节点为空，只能往下走
            else if(p->next->val > x) p = p->down;  // 右侧节点值大于x，往下走
            else p = p->next;  // 右侧节点值小于x，往右走
        }
        return false;  // p为空还没找到，则不存在
    }

    void erase(int x){  // 删除跳表中值为x的节点
        Node *p = head;
        while (p){
            if(p->next == NULL) p = p->down;
            else if(p->next->val > x) p = p->down;
            else if(p->next->val == x){
                p->next = p->next->next;  // 删除右侧值为x的节点
                p = p->down;
            }else p = p->next;
        }
    }

    void insert(int x){  // 往跳表中插入节点
        if(search(x)) return;  // 避免重复插入
        stack<Node *> st;  // 记录向下走过的节点，这一列节点后面可以插入x
        Node *p = head;
        while (p){
            if(p->next == NULL || p->next->val > x){
                st.push(p);  // 右侧没有节点或者右侧节点值更大， 说明新节点应该加在当前节点与右侧节点之间
                p = p->down;
            }else p = p->next;
        }
        Node *down = NULL;  // 保存最后一个插入的节点，如果最后需要额外新增一层时需要用到
        bool f = true;  // 用于保证最下方一层一定能插入新节点
        while (!st.empty()){
            Node *pre = st.top();  // 当前节点右侧用于插入新节点
            st.pop();
            double num = 1.0 * rand() / RAND_MAX;  // 取一个[0, 1)之间的随机数作为概率

            if(f || num < prob){  // 如果是最下面一层或者概率超过prob，则插入节点
                f = false;
                Node *t = new Node(x);
                if(pre->next == NULL) pre->next = t;
                else{
                    t->next = pre->next;  // 插入新节点
                    pre->next = t;
                }
                t->down = down;
                down = t;
            }else return;
        }
        // 通过随机数判断是否再上升一层
        double num = 1.0 * rand() / RAND_MAX;
        if(num < prob){  // 上升一层需要更新新的head节点
            Node *t = new Node(x);
            Node *minNode = new Node(INT32_MIN);
            minNode->down = head;
            minNode->next = t;
            t->down = down;
            head = minNode;
        }
    }

    void show(){
        Node *p = head;
        while (p){
            Node *node = p->next;
            cout << "head";
            while (node){
                cout << "-->" << node->val;
                node = node->next;
            }
            cout << "-->NULL" << endl;
            p = p->down;
        }
    }

    void info(){  // 统计跳表最大深度，节点个数和节点平均深度
        int maxDepth = 0;  // 最大深度
        unordered_map<Node*, int> level; // 所有节点的深度
        Node *p = head;
        while (p) {
            maxDepth ++;
            Node *node = p->next;
            while (node){
                level[node] = maxDepth;
                node = node->next;
            }
            p = p->down;
        }
        cout << "max depth=" << maxDepth << endl;
        int NodeNum = level.size();  // 节点数目
        int s = 0;
        for(auto it: level) s += it.second;
        double avgLevel = s * 1.0 / NodeNum;  // 平均深度
        unordered_set<int> H;
        for(auto it: level) H.insert(it.first->val);
        double repeatRatio = 1.0 * NodeNum / H.size();
        cout << "node num=" << NodeNum << ", avg level=" << avgLevel << endl;
        cout << "different val num=" << H.size() << ", repeat ratio=" << repeatRatio << endl;
    }
};

void testTime(vector<int> &nums, double prob){
    clock_t start, end;
    start = clock();
    auto sp = new SkipList(prob);
    for(int num: nums){
        sp->insert(num);
    }
    for(int num: nums){
        sp->search(num);
    }
    end = clock();
    cout << "test size=" << nums.size() << ", prob=" << prob << endl;
    cout << "cost time=" << (double)(end - start) / CLOCKS_PER_SEC << "s" << endl;
    sp->info();
    cout << "--------------------------------" << endl;
}
int main(){
    int n = 1000000;
    unordered_set<int> nums;
    while (nums.size() < n){
        nums.insert(rand() * rand());
    }
    vector<int> arr(nums.begin(), nums.end());

    for(int i = 1; i <= 9; i ++ ){
        testTime(arr, i * 1.0 / 10);
    }
}

在这里插入图片描述
$\quad$ 如上图所示，当概率因子为0.2时耗费时间3.4s，效率最高，此时跳表最大深度为6，平均深度为5.8892，跳表中节点个数为111w，意味着平均每个数被重复1.11次。下表给出了各个概率因子下的各具体参数：

概率因子	0.1	0.2	0.3	0.4	0.5	0.6	0.7	0.8	0.9
耗费时间(s)	4.229	3.402	3.458	3.664	3.833	3.996	4.376	5.224	7.481
跳表最大深度	6	9	11	17	18	27	35	55	109
跳表中节点数目	1111198	1250773	1428339	1667249	1998415	2500761	3330720	4994432	9997933
跳表中节点的平均深度	5.8892	8.74889	10.5719	16.332	17.0013	25.4991	32.6704	51.0072	99.9984
跳表中节点的重复率	1.1112	1.25077	1.42834	1.66725	1.99842	2.50076	3.33072	4.99443	9.99793

$\quad$ 经过更细粒度的实验，得到概率因子最佳值为0.27，此时耗时3s左右，最大深度11，平均深度10.63。
$\quad$ 使用相同的数据和测试方法，使用set耗时2.2s，使用unordered_set耗时0.6s，使用map耗时2s，使用unordered_map耗时0.7s。由此可见，c++自带的set和map性能更优，原因在于其不存在重复节点的情况。跳表主要是实现简单，其实性能和空间占用都比c++ stl更差。

三、Java版SkipList

import java.util.*;

public class SkipList {
    private class Node{
        int val;
        Node next;
        Node down;
        Node(int x){
            this.val = x;
            this.next = this.down = null;
        }
    }
    private Node head;
    private double prob;  // 概率因子
    SkipList(double prob){
        this.prob = prob;
        this.head = new Node(Integer.MIN_VALUE);
    }
    public boolean search(int x){
        Node p = head;
        while (p != null){
            if(p.val == x) return true;
            if(p.next == null || p.next.val > x) p = p.down;
            else p = p.next;
        }
        return false;
    }

    public void erase(int x){
        Node p = head;
        while (p != null){
            if(p.next == null || p.next.val > x) p = p.down;
            else if(p.next.val == x){
                p.next = p.next.next;
                p = p.down;
            }else p = p.next;
        }
    }

    public void insert(int x){
        if(search(x)) return;
        Stack<Node> st = new Stack<>();
        Node p = head;
        while (p != null){
            if(p.next == null || p.next.val > x){
                st.add(p);
                p = p.down;
            }else p = p.next;
        }
        Node down = null;
        boolean f = true;
        while(!st.isEmpty()){
            Node pre = st.pop();
            double num = Math.random();
            if(f || num < this.prob){
                f = false;
                Node t = new Node(x);
                if(pre.next == null) pre.next = t;
                else{
                    t.next = pre.next;
                    pre.next = t;
                }
                t.down = down;
                down = t;
            }else return;
        }
        double num = Math.random();
        if(num < this.prob){
            Node t = new Node(x);
            Node minNode = new Node(Integer.MIN_VALUE);
            minNode.down = head;
            minNode.next = t;
            t.down = down;
            head = minNode;
        }
    }

    public void print(){
        Node p = head;
        while (p != null){
            Node node = p.next;
            System.out.print("head");
            while (node != null){
                System.out.print("-->" + node.val);
                node = node.next;
            }
            System.out.println("-->NULL");
            p = p.down;
        }
    }

    public static void main(String[] args) {
        int n = 1000000;
        Set<Integer> st = new HashSet<>();
        while (st.size() < n){
            st.add((int)(Math.random() * 1000000000));
        }
        ArrayList<Integer> arr = new ArrayList<>();
        for(Integer num: st){
            arr.add(num);
        }
        long start = System.currentTimeMillis();
        SkipList sp = new SkipList(0.27);
        for(Integer num: arr){
            sp.insert(num);
        }
        for(Integer num: arr){
            sp.search(num);
        }
        long end = System.currentTimeMillis();
        System.out.println("SkipList cost time=" + (end - start) / 1000.0 + "s");

        start = System.currentTimeMillis();
        Set<Integer> set = new TreeSet<>();
        for(Integer num: arr){
            set.add(num);
        }
        for(Integer num: arr){
            set.contains(num);
        }
        end = System.currentTimeMillis();
        System.out.println("TreeSet cost time=" + (end - start) / 1000.0 + "s");
    }
}