Pandas之數據合併(concat、merge)詳解

2020-08-08 21:57:43

數據合併

#準備數據
import numpy as np
import pandas as pd
def make_df(cols,ind):
    '''生成一個簡單的DataFrame數據'''
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data,ind)
#測試函數
df = make_df("ABC", range(5))
print(df)

在这里插入图片描述

concat

pd.concat實現Series的合併
import pandas as pd
#簡單的合併
s1 = pd. Series(list("ABC"), index = [1,2,3])
s2 = pd. Series(list("DEF"), index =[4,5,6])
s = pd.concat([s1, s2] )
print("合併後: \n", s)

在这里插入图片描述

pd.concat實現DataFrame的合併
import pandas as pd
def make_df(cols, index):
    """一個簡單的DataFrame"""
    data = {c:[str(c)+str(i) for i in index] for c in cols}
    return pd.DataFrame(data, index)
# DF合併
df1 = make_df("ABC", [1,2,3] )
df2 = make_df("DEF", [4,5,6] )
print("df1 = \n", df1)
print("\n df2 = \n", df2)

在这里插入图片描述

merge

import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})
#此數據結構意味着每個組需要掌握的專業技能,一 組需要有多個技能用重複值表示
df4 = pd. DataFrame({"group":["I", "I", "II","II", "II", "III","III"],"skills": ["Linux", "Python", "Java" ,"Math", "English", "C++", "PHP"]})
print("df1 = \n",df1)
print("\n df2 = \n", df2)
print("\n df3 = \n",df3)
print("\n df4 = \n",df4)

在这里插入图片描述

一對一的連線
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"name": list ("ABCD"),"score": [61,78,74,98]})

df5=pd.merge(df1,df2)
print("df5 = \n",df5)

在这里插入图片描述

多對一的連線
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})

df5=pd.merge(df1,df3)
print("df5 = \n",df5)

在这里插入图片描述

多對多的連線
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})
#此數據結構意味着每個組需要掌握的專業技能,一 組需要有多個技能用重複值表示
df4 = pd. DataFrame({"group":["I", "I", "II","II", "II", "III","III"],"skills": ["Linux", "Python", "Java" ,"Math", "English", "C++", "PHP"]})

df5=pd.merge(df1,df4)
print("df5 = \n",df5)

在这里插入图片描述

on參數的應用

代表着起作用的那個

import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})
#此數據結構意味着每個組需要掌握的專業技能,一 組需要有多個技能用重複值表示
df4 = pd. DataFrame({"group":["I", "I", "II","II", "II", "III","III"],"skills": ["Linux", "Python", "Java" ,"Math", "English", "C++", "PHP"]})

df5=pd.merge(df1,df2,on="name")
print("df5 = \n",df5)

在这里插入图片描述

left_on和right_on參數的應用
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"my_name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})
#此數據結構意味着每個組需要掌握的專業技能,一 組需要有多個技能用重複值表示
df4 = pd. DataFrame({"group":["I", "I", "II","II", "II", "III","III"],"skills": ["Linux", "Python", "Java" ,"Math", "English", "C++", "PHP"]})

df5=pd.merge(df1,df2,left_on="name",right_on="my_name")
print("df5 = \n",df5)
print("\n df5.drop = \n",df5.drop("my_name",axis=1))

在这里插入图片描述

left_index和right_index參數的說明
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":["I", "II", "III", "II"]})
df2 = pd. DataFrame( {"my_name": list ("ABCD"),"score": [61,78,74,98]})
df3 = pd. DataFrame({"group":["I", "II", "III"], "leader": ["Alice", "Bob", "Cindy"]})
#此數據結構意味着每個組需要掌握的專業技能,一 組需要有多個技能用重複值表示
df4 = pd. DataFrame({"group":["I", "I", "II","II", "II", "III","III"],"skills": ["Linux", "Python", "Java" ,"Math", "English", "C++", "PHP"]})

df5=pd.merge(df1,df2,left_index=True,right_index=True)
print("df5 = \n",df5)

在这里插入图片描述

how參數的使用
  • 內連線: how=‘inner’, 此時結果只保留交集
  • 外連線: how=‘outer’,此時結果保留的是兩個數據集的並集
  • 左連線:how=‘left’,此時結果保留左側全部內容,有連線的右側內容也會保留
  • 右連線:how=‘right’,此時結果保留右側全部內容,有鏈接的左側內容也會保留
import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":[1,2,3,4]})
df2 = pd. DataFrame( {"name": list ("EFGH"),"score": [61,78,74,98]})

df5=pd.merge(df1,df2,how="outer")
print("df5 = \n",df5)

在这里插入图片描述

import pandas as pd
##準備數據
df1 = pd. DataFrame({"name":list("ABCD"),"group":[1,2,3,4]})
df2 = pd. DataFrame( {"name": list ("EFGH"),"score": [61,78,74,98]})

df5=pd.merge(df1,df2,how="inner")
print("df5 = \n",df5)

在这里插入图片描述