数据迭代和数据遍历都是按照某种顺序逐个对数据进行访问和操作,在 Python 中大多由 for
语句来引导。Pandas 中的迭代操作可以将数据按行或者按列遍历,我们可以进行更加细化、个性化的数据处理。
数据准备
- pyodps_iris
-- drop table pyodps_iris; create table if not exists pyodps_iris ( sepallength double comment '片长度(cm)', sepalwidth double comment '片宽度(cm)', petallength double comment '瓣长度(cm)', petalwidth double comment '瓣宽度(cm)', name string comment '种类' );
- 数据集
insert overwrite table pyodps_iris values (5.1,3.5,1.4,0.2,'Iris-setosa'), (4.9,3,1.4,0.2,'Iris-setosa'), (4.7,3.2,1.3,0.2,'Iris-setosa'), (4.6,3.1,1.5,0.2,'Iris-setosa'), (5,3.6,1.4,0.2,'Iris-setosa'), (5.4,3.9,1.7,0.4,'Iris-setosa'), (4.6,3.4,1.4,0.3,'Iris-setosa'), (5,3.4,1.5,0.2,'Iris-setosa'), (4.4,2.9,1.4,0.2,'Iris-setosa'), (4.9,3.1,1.5,0.1,'Iris-setosa'), (5.4,3.7,1.5,0.2,'Iris-setosa'), (4.8,3.4,1.6,0.2,'Iris-setosa'), (4.8,3,1.4,0.1,'Iris-setosa'), (4.3,3,1.1,0.1,'Iris-setosa'), (5.8,4,1.2,0.2,'Iris-setosa'), (5.7,4.4,1.5,0.4,'Iris-setosa'), (5.4,3.9,1.3,0.4,'Iris-setosa'), (5.1,3.5,1.4,0.3,'Iris-setosa'), (5.7,3.8,1.7,0.3,'Iris-setosa'), (5.1,3.8,1.5,0.3,'Iris-setosa'), (5.4,3.4,1.7,0.2,'Iris-setosa'), (5.1,3.7,1.5,0.4,'Iris-setosa'), (4.6,3.6,1,0.2,'Iris-setosa'), (5.1,3.3,1.7,0.5,'Iris-setosa'), (4.8,3.4,1.9,0.2,'Iris-setosa'), (5,3,1.6,0.2,'Iris-setosa'), (5,3.4,1.6,0.4,'Iris-setosa'), (5.2,3.5,1.5,0.2,'Iris-setosa'), (5.2,3.4,1.4,0.2,'Iris-setosa'), (4.7,3.2,1.6,0.2,'Iris-setosa'), (4.8,3.1,1.6,0.2,'Iris-setosa'), (5.4,3.4,1.5,0.4,'Iris-setosa'), (5.2,4.1,1.5,0.1,'Iris-setosa'), (5.5,4.2,1.4,0.2,'Iris-setosa'), (4.9,3.1,1.5,0.1,'Iris-setosa'), (5,3.2,1.2,0.2,'Iris-setosa'), (5.5,3.5,1.3,0.2,'Iris-setosa'), (4.9,3.1,1.5,0.1,'Iris-setosa'), (4.4,3,1.3,0.2,'Iris-setosa'), (5.1,3.4,1.5,0.2,'Iris-setosa'), (5,3.5,1.3,0.3,'Iris-setosa'), (4.5,2.3,1.3,0.3,'Iris-setosa'), (4.4,3.2,1.3,0.2,'Iris-setosa'), (5,3.5,1.6,0.6,'Iris-setosa'), (5.1,3.8,1.9,0.4,'Iris-setosa'), (4.8,3,1.4,0.3,'Iris-setosa'), (5.1,3.8,1.6,0.2,'Iris-setosa'), (4.6,3.2,1.4,0.2,'Iris-setosa'), (5.3,3.7,1.5,0.2,'Iris-setosa'), (5,3.3,1.4,0.2,'Iris-setosa'), (7,3.2,4.7,1.4,'Iris-versicolor'), (6.4,3.2,4.5,1.5,'Iris-versicolor'), (6.9,3.1,4.9,1.5,'Iris-versicolor'), (5.5,2.3,4,1.3,'Iris-versicolor'), (6.5,2.8,4.6,1.5,'Iris-versicolor'), (5.7,2.8,4.5,1.3,'Iris-versicolor'), (6.3,3.3,4.7,1.6,'Iris-versicolor'), (4.9,2.4,3.3,1,'Iris-versicolor'), (6.6,2.9,4.6,1.3,'Iris-versicolor'), (5.2,2.7,3.9,1.4,'Iris-versicolor'), (5,2,3.5,1,'Iris-versicolor'), (5.9,3,4.2,1.5,'Iris-versicolor'), (6,2.2,4,1,'Iris-versicolor'), (6.1,2.9,4.7,1.4,'Iris-versicolor'), (5.6,2.9,3.6,1.3,'Iris-versicolor'), (6.7,3.1,4.4,1.4,'Iris-versicolor'), (5.6,3,4.5,1.5,'Iris-versicolor'), (5.8,2.7,4.1,1,'Iris-versicolor'), (6.2,2.2,4.5,1.5,'Iris-versicolor'), (5.6,2.5,3.9,1.1,'Iris-versicolor'), (5.9,3.2,4.8,1.8,'Iris-versicolor'), (6.1,2.8,4,1.3,'Iris-versicolor'), (6.3,2.5,4.9,1.5,'Iris-versicolor'), (6.1,2.8,4.7,1.2,'Iris-versicolor'), (6.4,2.9,4.3,1.3,'Iris-versicolor'), (6.6,3,4.4,1.4,'Iris-versicolor'), (6.8,2.8,4.8,1.4,'Iris-versicolor'), (6.7,3,5,1.7,'Iris-versicolor'), (6,2.9,4.5,1.5,'Iris-versicolor'), (5.7,2.6,3.5,1,'Iris-versicolor'), (5.5,2.4,3.8,1.1,'Iris-versicolor'), (5.5,2.4,3.7,1,'Iris-versicolor'), (5.8,2.7,3.9,1.2,'Iris-versicolor'), (6,2.7,5.1,1.6,'Iris-versicolor'), (5.4,3,4.5,1.5,'Iris-versicolor'), (6,3.4,4.5,1.6,'Iris-versicolor'), (6.7,3.1,4.7,1.5,'Iris-versicolor'), (6.3,2.3,4.4,1.3,'Iris-versicolor'), (5.6,3,4.1,1.3,'Iris-versicolor'), (5.5,2.5,4,1.3,'Iris-versicolor'), (5.5,2.6,4.4,1.2,'Iris-versicolor'), (6.1,3,4.6,1.4,'Iris-versicolor'), (5.8,2.6,4,1.2,'Iris-versicolor'), (5,2.3,3.3,1,'Iris-versicolor'), (5.6,2.7,4.2,1.3,'Iris-versicolor'), (5.7,3,4.2,1.2,'Iris-versicolor'), (5.7,2.9,4.2,1.3,'Iris-versicolor'), (6.2,2.9,4.3,1.3,'Iris-versicolor'), (5.1,2.5,3,1.1,'Iris-versicolor'), (5.7,2.8,4.1,1.3,'Iris-versicolor'), (6.3,3.3,6,2.5,'Iris-virginica'), (5.8,2.7,5.1,1.9,'Iris-virginica'), (7.1,3,5.9,2.1,'Iris-virginica'), (6.3,2.9,5.6,1.8,'Iris-virginica'), (6.5,3,5.8,2.2,'Iris-virginica'), (7.6,3,6.6,2.1,'Iris-virginica'), (4.9,2.5,4.5,1.7,'Iris-virginica'), (7.3,2.9,6.3,1.8,'Iris-virginica'), (6.7,2.5,5.8,1.8,'Iris-virginica'), (7.2,3.6,6.1,2.5,'Iris-virginica'), (6.5,3.2,5.1,2,'Iris-virginica'), (6.4,2.7,5.3,1.9,'Iris-virginica'), (6.8,3,5.5,2.1,'Iris-virginica'), (5.7,2.5,5,2,'Iris-virginica'), (5.8,2.8,5.1,2.4,'Iris-virginica'), (6.4,3.2,5.3,2.3,'Iris-virginica'), (6.5,3,5.5,1.8,'Iris-virginica'), (7.7,3.8,6.7,2.2,'Iris-virginica'), (7.7,2.6,6.9,2.3,'Iris-virginica'), (6,2.2,5,1.5,'Iris-virginica'), (6.9,3.2,5.7,2.3,'Iris-virginica'), (5.6,2.8,4.9,2,'Iris-virginica'), (7.7,2.8,6.7,2,'Iris-virginica'), (6.3,2.7,4.9,1.8,'Iris-virginica'), (6.7,3.3,5.7,2.1,'Iris-virginica'), (7.2,3.2,6,1.8,'Iris-virginica'), (6.2,2.8,4.8,1.8,'Iris-virginica'), (6.1,3,4.9,1.8,'Iris-virginica'), (6.4,2.8,5.6,2.1,'Iris-virginica'), (7.2,3,5.8,1.6,'Iris-virginica'), (7.4,2.8,6.1,1.9,'Iris-virginica'), (7.9,3.8,6.4,2,'Iris-virginica'), (6.4,2.8,5.6,2.2,'Iris-virginica'), (6.3,2.8,5.1,1.5,'Iris-virginica'), (6.1,2.6,5.6,1.4,'Iris-virginica'), (7.7,3,6.1,2.3,'Iris-virginica'), (6.3,3.4,5.6,2.4,'Iris-virginica'), (6.4,3.1,5.5,1.8,'Iris-virginica'), (6,3,4.8,1.8,'Iris-virginica'), (6.9,3.1,5.4,2.1,'Iris-virginica'), (6.7,3.1,5.6,2.4,'Iris-virginica'), (6.9,3.1,5.1,2.3,'Iris-virginica'), (5.8,2.7,5.1,1.9,'Iris-virginica'), (6.8,3.2,5.9,2.3,'Iris-virginica'), (6.7,3.3,5.7,2.5,'Iris-virginica'), (6.7,3,5.2,2.3,'Iris-virginica'), (6.3,2.5,5,1.9,'Iris-virginica'), (6.5,3,5.2,2,'Iris-virginica'), (6.2,3.4,5.4,2.3,'Iris-virginica'), (5.9,3,5.1,1.8,'Iris-virginica');
迭代 Series
Series 本身是一个可迭代对象,可直接对 Series 使用 for
语句来遍历它的值。
import pandas as pd df = pd.DataFrame( [ ['liver','E',89,21,24,64], ['Arry','C',36,37,37,57], ['Ack','A',57,60,18,84], ['Eorge','C',93,96,71,78], ['Oah','D',65,49,61,86] ], columns = ['name','team','Q1','Q2','Q3','Q4'] ) # 迭代指定的列 for i in df.name: print(i) # df.name.values 返回 array 结构数据可用于迭代;返回结果与上面的语句相同 for i in df.name.values: print(i)
迭代索引和指定的多列,使用 python 内置的 zip
函数将其打包为可迭代的 zip 对象。
import pandas as pd df = pd.DataFrame( [ ['liver','E',89,21,24,64], ['Arry','C',36,37,37,57], ['Ack','A',57,60,18,84], ['Eorge','C',93,96,71,78], ['Oah','D',65,49,61,86] ], columns = ['name','team','Q1','Q2','Q3','Q4'] ) # 迭代索引和指定的两列 for i, n, q in zip(df.index, df.name, df.Q1): print(i, n, q)
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(5).to_pandas() print(type(iris)) print('----------------------------------------------------') print(iris) print('----------------------------------------------------') for i in iris.name.values: print(i) print('----------------------------------------------------') for index, col1, col2 in zip(iris.index, iris.name, iris.sepallength): print(index, col1, col2)
迭代 DataFrame
DataFrame 是 Pandas 库中最常用的数据结构之一,迭代 DataFrame 通常分为两种方式:按列迭代和按行迭代。
按列迭代指的是按照列的顺序逐个迭代 DataFrame 中的数据。可以使用 DataFrame 的 column 属性获取列名列表,并通过 for
循环遍历每个列名,然后再使用列名作为索引访问 DataFrame 的每列数据。
按行迭代指的是按照行的顺序逐个迭代 DataFrame 中的数据。可以使用 DataFrame 的 iterrows()
方法来实现迭代,该方法返回一个生成器,每次迭代返回一个元祖,其中包含行索引和行数据。
使用 Dataframe 的 index 属性
pandas.DataFrame.index 是 DataFrame 对象的属性,用于获取 DataFrame 的行索引。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(5).to_pandas() # 行索引 print(iris.index) # 按照行索引循环,获取 'name' 列和 'sepallength' 列 for i in iris.index: print(iris['name'][i], iris['sepallength'][i])
使用 DataFrame 的 loc 标签索引
loc
是 pandas.DataFrame 的一个属性,用于根据行/列标签从 DataFrame 中访问元素。
loc
以行标签和列标签作为参数,抽取整行或整列数据,当只有一个参数时,默认是行标签。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(5).to_pandas() for i in range(len(iris)): print(iris.loc[i, 'name'], iris.loc[i, 'sepallength'])
使用 DataFrame 的 iloc 位置索引
iloc
以行和列位置索引作为参数,0 表示第一行,1 表示第二行。抽取整行或整列数据,当只有一个参数时,默认是行索引。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(5).to_pandas() for i in range(len(iris)): print(iris.iloc[i, 4], iris.iloc[i, 0], iris.iloc[i, 2])
iteritems()
iteritems()
按列遍历,将 DataFrame 的每一列迭代为(列名, Series)对,可以通过 row[index]
对元素进行访问。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(3).to_pandas() for index, row in iris.iteritems(): # 输出每列列名 print(index) # 输出每列的值 print(row) # 仅输出每列第一行的值 print(row[0]) print('-----------------------')
iterrows()
iterrows()
按行遍历,将 DataFrame 的每一行迭代为(index, Series)对,可以通过 row[name]
对元素进行访问。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(3).to_pandas() for index, row in iris.iterrows(): # 输出每行的索引值 print(index) # 输出每行的值 print(row) # 输出每一行 'name'、'sepallength' 两列的值 print(row['name'], row['sepallength']) print('-----------------------')
itertuples()
itertuples()
按行遍历,将 DataFrame 的每一行迭代为元祖,可以通过 row[name]
对元素进行访问,比 iterrows()
效率高。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(3).to_pandas() for row in iris.itertuples(): # 输出每行的值 print(row) # 输出每一行 'name'、'sepallength' 两列的值 print(getattr(row, 'name'), getattr(row, 'sepallength')) print('-----------------------')
循环更新值
iterrows()
方法逐行检索值,返回一个副本,而不是视图,因此更改 pandas.Series 不会更新原始数据。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(3).to_pandas() for index, row in iris.iterrows(): row['sepallength'] += row['sepallength'] print(iris)
使用 at[]
选择并处理原始 DataFrame 中的数据,用于更新。
from odps import ODPS from odps.df import DataFrame import pandas as pd iris = o.get_table('pyodps_iris').to_df().head(3).to_pandas() for index, row in iris.iterrows(): iris.at[index, 'sepallength'] += row['sepallength'] print(iris)
参考资料
深入浅出 Pandas > Pandas 高级操作 > 数据迭代
原创文章,转载请注明出处:http://www.opcoder.cn/article/69/