python pandas库的应用（类比mysql语言）

转载自：http://blog.csdn.net/awayyao/article/details/17785473

create table 我们尝试创建一张表，保存每天的三个值

In[6]:

from pandas import *
import numpy as np
randn = np.random.randn
random_integers = np.random.random_integers

# create table df (id int, dt datetime, a float, b float, c float, primary key (id));
df = DataFrame(randn(30, 3), date_range('1/1/2013', periods=30), columns=list('abc'))
# select * from df order by id asc limit 7;
df.head(7)

Out[6]:

	a	b	c
2013-01-01	-1.826059	-2.131391	1.094457
2013-01-02	-0.726806	0.928534	0.006347
2013-01-03	0.427909	-0.897453	1.803147
2013-01-04	-1.549575	-0.045432	0.086797
2013-01-05	-2.640359	-0.190043	-0.734281
2013-01-06	0.028888	-0.315157	-0.043611
2013-01-07	-1.314698	1.082857	-0.895359

select 语句

In[7]:

# select a,c from df limit 7
df.loc[:,['a','c']].head(7)

Out[7]:

	a	c
2013-01-01	-1.826059	1.094457
2013-01-02	-0.726806	0.006347
2013-01-03	0.427909	1.803147
2013-01-04	-1.549575	0.086797
2013-01-05	-2.640359	-0.734281
2013-01-06	0.028888	-0.043611
2013-01-07	-1.314698	-0.895359

where 子句

In[8]:

# select a,c from df where dt > "2013-01-03" and dt < "2013-01-10" 
df.loc[(df.index>"2013-01-03") & (df.index<"2013-01-10"),['a','c']]

Out[8]:

	a	c
2013-01-04	-1.549575	0.086797
2013-01-05	-2.640359	-0.734281
2013-01-06	0.028888	-0.043611
2013-01-07	-1.314698	-0.895359
2013-01-08	-0.613846	1.317529
2013-01-09	0.814209	1.403127

order 子句

In[9]:

# select a,c from df where dt > "2013-01-03" and dt < "2013-01-10" order by a asc, c desc
df.loc[(df.index>"2013-01-03") & (df.index<"2013-01-10"),['a','c']].sort_index(by=['a','c'], ascending=[1,0])

Out[9]:

	a	c
2013-01-05	-2.640359	-0.734281
2013-01-04	-1.549575	0.086797
2013-01-07	-1.314698	-0.895359
2013-01-08	-0.613846	1.317529
2013-01-06	0.028888	-0.043611
2013-01-09	0.814209	1.403127

接下来我们尝试新建一张存放每天app_id安装记录的表，app_id是app的id，total是安装数。

insert 子句

In[10]:

# create table install_log (id int, dt datetime, app_id int, total int, primary key (id))
install_log = DataFrame(columns=['app_id', 'total'])

# insert into install_log values(NULL, '2013-01-01', 100, 234);
# for aggregation, 10 times
app100_install = DataFrame([dict(app_id=100, total=random_integers(1000)) for i in range(30)],index=date_range('1/1/2013', periods=30))
install_log = install_log.append(app100_install)
app101_install = DataFrame([dict(app_id=101, total=random_integers(1000)) for i in range(30)],index=date_range('1/1/2013', periods=30))
install_log = install_log.append(app101_install)
install_log.head(3), install_log.tail(3)

Out[10]:

(            app_id  total
2013-01-01     100    884
2013-01-02     100     73
2013-01-03     100     13,
             app_id  total
2013-01-28     101    597
2013-01-29     101    267
2013-01-30     101    500)

group by 子句

In[11]:

# select app_id, sum(total), avg(total) from install_log group by app_id;
for column_value, column_rows in install_log.groupby('app_id'):
    print column_value, column_rows['total'].sum(), column_rows['total'].mean()

100 16622 554.066666667
101 12699 423.3

还有另一张表维护app_id和app名字之间的映射关系

In[12]:

# create table dimapp (id int, app_id int, app_name varchar(255), primary key (id));
# insert into dimapp values(NULL, 100, 'facebook'), (NULL, 101, 'twitter'));
dimapp = DataFrame([dict(app_id=100, app_name='facebook'), dict(app_id=101, app_name='twitter')])
dimapp = dimapp.set_index('app_id')
dimapp

Out[12]:

	app_name
app_id
100	facebook
101	twitter

我们把最后的结果进行join

In[13]:

# select dimapp.name, t.sum_total from (select app_id,sum(total) as sum_total from install_log group by app_id)t join dimapp on t.app_id=t.app_id
grouped = install_log.groupby('app_id').aggregate(np.sum)
result = grouped.join(dimapp).set_index('app_name')
result.plot(kind='bar')

Out[13]:

更多相关文章

随机推荐