带权重的随机

今天想写一个where_to_eat_lunch的小程序，帮助选择中午吃饭的地方，其实是练习随机选择的问题。（让我想到了Knuth大神的n个中均匀抽样m个数并且有序的方法，这个还是自行再去看看编程珠玑的抽样问题吧）

本来以为是一个很容易搞定的问题，结果遇到了各种坑。

坑一：
time() 函数的问题。开始的时候没有#include <time.h> 竟然编译通过了，但是这个函数有问题，至今没有搞懂奇怪的行为。描述一下。

[root@localhost ~]# cat 1.c
#include <stdio.h>

//#include <time.h>


int main(int argc, char ** argv)
{
    unsigned long my_seed = 0;
    //my_seed = (unsigned long)time(NULL);
    my_seed = (unsigned long)time();
    int i = 0;

    srand(my_seed);
    for(i = 0; i < 100; ++i){
        printf("%d\n", rand());
    }


    return 0;
}

[root@localhost ~]# gcc 1.c
[root@localhost ~]# ldd a.out
    linux-vdso.so.1 =>  (0x00007fff5a5fe000)
    libc.so.6 => /lib64/libc.so.6 (0x00007f8719dbc000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f871a184000)
[root@localhost ~]# nm a.out | grep time
                 U time@@GLIBC_2.2.5

看似就是普通glibc里的函数。但是竟然不要参数！！！在<time.h>里的time是需要参数的！而且这个a.out运行会coredump

[root@localhost ~]# ./a.out
Segmentation fault (core dumped)
[New LWP 7844]
Core was generated by `./a.out'.
Program terminated with signal 11, Segmentation fault.
#0  0x00007fff5b9fefe1 in time ()
Missing separate debuginfos, use: debuginfo-install glibc-2.17-55.el7.x86_64
(gdb) q

能不能不coredump呢，能！改一下code，非常神奇。

[root@localhost ~]# cat 1.c
#include <stdio.h>

//#include <time.h>


int main(int argc, char ** argv)
{
    int array[100] = {};  //加一行这里

    unsigned long my_seed = 0;
    //my_seed = (unsigned long)time(NULL);
    my_seed = (unsigned long)time();
    int i = 0;

    srand(my_seed);
    for(i = 0; i < 100; ++i){
        printf("%d\n", rand());
    }

    return 0;
}

好吧，这里暂且不管这个神奇的time()。这里主要记住的就是库函数time()是要有参数的！要include<time.h>. 参数可以是NULL但是一定要有。

坑二：
也不算是一个坑。突然想起来看过一篇文章说，谈到人的认知错觉，当人在听歌的时候选择在列表里随机播放，当听到一首歌，下一次再听到，就总觉得听到这一首歌的概率很大，感觉概率是不均匀的。苹果的播放器开始就是随机的播放，但是由于人的这个错觉，乔布斯只好改进了一下，将刚听过的歌曲再次出现的概率降低。也就是降低再次出现的权重。
这里我也想引入权重，每次被选择后权重降低。听起来不错。实现起来发现并没有太好的时间空间复杂度的算法（也许有好的算法，但没想出来~~！）。

先写一个放在这里吧。思路是：
1 每个饭馆有一个对应的rate
2 将rate值累加得到另外一个数组cumulative，像这样

	0号饭馆	1号饭馆	2号饭馆	3号饭馆
rate	1	2.2	3.0	4.5
cumulative	1	3.2	6.2	10.7

总的rate是1 + 2.2 + 3.0 + 4.5 = 10.7
3 每次随机选择一个100以内的数，算一个百分比，比如随机选择了32，那么百分比就是0.32
4 总的rate 乘以这个百分比，这个例子中就是10.7 * 0.32（我说不好这叫什么化，反正就是把100的尺度按比例尺拉成了总的rate这个尺度）
5 那么第4步的这个乘积落在哪个rate的区间上，就取那个rate的index位置，也就是对应的那个饭馆的index位置。

6 用float不好理解，用整型就好理解了

	0号饭馆	1号饭馆	2号饭馆	3号饭馆
rate	1	3	3	3
cumulative	1	4	7	10

也就是说，0号饭馆的权重是1， 1号饭馆的权重是3，2号饭馆的权重是3，3号饭馆的权重是3。一共是10份，0号占1份，1，2和3号各占3份。随机的时候，100个数随机到了10以内，就是0号，随机到了10-40，就是1号。随机到了40-70就是2号，贼好理解。

[root@localhost ~]# cat where_to_eat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM 6

void show(float *rate){
    int i = 0;

    for(i = 0; i < NUM ; ++i) {
        printf("%f\n", rate[i]);
    }

    return;
}

void rise_all_rate(float *rate){
    int i = 0;

    for(i = 0; i < NUM; ++i){
        rate[i] *= 100;
    }

    return;
}

int get_random_via_weight(float *rate) {
    unsigned long my_seed = (unsigned long)time(NULL);
    int random_value = 0;
    int i = 0;
    float cumulative[NUM] = {0.0};
    float random_value_percent = 0.0;
    float sum_by_percent = 0.0;

    srand(my_seed);
    random_value = rand() % 100;
    random_value_percent = random_value / 100.0;

    for(cumulative[0] = rate[0],i = 1; i < NUM; ++i){
        cumulative[i] = rate[i] + cumulative[i-1];
    }
    sum_by_percent = cumulative[NUM-1] * random_value_percent;

    for(i = 0; i < NUM; ++i) {
        if(sum_by_percent - cumulative[i] < 0) break;
    }

    rate[i] = rate[i] / 2;
    if(rate[i] < 0.1) {
        rise_all_rate(rate);
    }
    printf("random value is %d, percent is %f, choose %d\n", random_value, random_value_percent, i);
    show(rate);

    return i;
}

int main(int argc, char ** argv)
{
    char *restaurant[NUM] = {"tian", "he", "la",
                                                 "kfc", "mc", "subway"};
    float rate[NUM] = {0.0};
    float *p = rate;
    int i = 0;
    unsigned char c = '\0';

    for(i = 0; i < NUM; ++i) {
        rate[i] = 10.0;
    }

    while(1){
        c = getchar();
        if('y' == c) break;
        i = get_random_via_weight(p);
        printf("Ok, Let's go to %s for lunch.\n", restaurant[i]);
    }

    return 0;
}

这里是有坑的。
1 time(NULL)这玩意返回的和Epoch（1970.1.1）之间的秒数，同一秒内得到的结果一样，也就是seed一样
2 因为机器是伪随机的，每次调用get_random_via_weight的时候，重新开始随机，seed如果一样随机的结果是一样一样的。如果在同一秒内，rand()的返回值是一样的。

所以调整一下，全局的事情全局做，my_seed 弄成了全局的，只srand一次。

[root@localhost ~]# cat where_to_eat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM 6

unsigned long my_seed;
void show(float *rate){
    int i = 0;

    for(i = 0; i < NUM ; ++i) {
        printf("%f\n", rate[i]);
    }

    return;
}

void rise_all_rate(float *rate){
    int i = 0;

    for(i = 0; i < NUM; ++i){
        rate[i] *= 100;
    }

    return;
}

int get_random_via_weight(float *rate) {
    static int random_value = 0;
    int i = 0;
    float cumulative[NUM] = {0.0};
    float random_value_percent = 0.0;
    float sum_by_percent = 0.0;

    random_value = rand() % 100;
    random_value_percent = random_value / 100.0;

    for(cumulative[0] = rate[0],i = 1; i < NUM; ++i){
        cumulative[i] = rate[i] + cumulative[i-1];
    }
    sum_by_percent = cumulative[NUM-1] * random_value_percent;

    for(i = 0; i < NUM; ++i) {
        if(sum_by_percent - cumulative[i] < 0) break;
    }

    rate[i] = rate[i] / 2;
    if(rate[i] < 0.1) {
        rise_all_rate(rate);
    }
    printf("random value is %d, percent is %f, choose %d\n", random_value, random_value_percent, i);
    show(rate);

    return i;
}

int main(int argc, char ** argv)
{
    char *restaurant[NUM] = {"tian", "he", "la",
                                                 "kfc", "mc", "subway"};
    float rate[NUM] = {0.0};
    float *p = rate;
    int i = 0;
    unsigned char c = '\0';

    my_seed = (unsigned long)time(NULL);
    srand(my_seed);

    for(i = 0; i < NUM; ++i) {
        rate[i] = 10.0;
    }

    int test = 100;
    while(test--){
#if 0
        c = getchar();
        if('y' == c) break;
#endif
        i = get_random_via_weight(p);
        printf("Ok, Let's go to %s for lunch.\n", restaurant[i]);
    }

    return 0;
}

这个算法在饭馆数组特别大的时候，就不算太好了，累加那么多次。有改进的方案是数组分块，每一块大小可以是sqrt(total_array_size)。然后每次更新权重只是影响一块，块间的cumulative要重新计算。然后随机落到哪个块里了，在那个块里再细细掰扯。