반응형
In [1]:
import pandas as pd
df = pd.read_csv('C:\\Users\\rladl\\Jupyter.study\\05000266\\part3\\auto-mpg.csv', header=None)
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','name']
print(df.head())
print('\n')
print(df.tail())
mpg cylinders displacement horsepower weight acceleration model year \
0 18.0 8 307.0 130.0 3504.0 12.0 70
1 15.0 8 350.0 165.0 3693.0 11.5 70
2 18.0 8 318.0 150.0 3436.0 11.0 70
3 16.0 8 304.0 150.0 3433.0 12.0 70
4 17.0 8 302.0 140.0 3449.0 10.5 70
origin name
0 1 chevrolet chevelle malibu
1 1 buick skylark 320
2 1 plymouth satellite
3 1 amc rebel sst
4 1 ford torino
mpg cylinders displacement horsepower weight acceleration \
393 27.0 4 140.0 86.00 2790.0 15.6
394 44.0 4 97.0 52.00 2130.0 24.6
395 32.0 4 135.0 84.00 2295.0 11.6
396 28.0 4 120.0 79.00 2625.0 18.6
397 31.0 4 119.0 82.00 2720.0 19.4
model year origin name
393 82 1 ford mustang gl
394 82 2 vw pickup
395 82 1 dodge rampage
396 82 1 ford ranger
397 82 1 chevy s-10
데이터 요약 정보 확인하기¶
In [2]:
# df의 모양과 크기 확인 : (행의 개수, 열의 개수)를 튜플로 반환
print(df.shape)
(398, 9)
In [3]:
# 데이터 프레임의 df의 내용 확인
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mpg 398 non-null float64
1 cylinders 398 non-null int64
2 displacement 398 non-null float64
3 horsepower 398 non-null object
4 weight 398 non-null float64
5 acceleration 398 non-null float64
6 model year 398 non-null int64
7 origin 398 non-null int64
8 name 398 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
None
In [4]:
# 데이터 프레임 df의 자료형 확인
print(df.dtypes)
print('\n')
print(df.mpg.dtypes)
mpg float64
cylinders int64
displacement float64
horsepower object
weight float64
acceleration float64
model year int64
origin int64
name object
dtype: object
float64
In [6]:
print(df.describe)
print('\n')
print(df.describe(include='all'))
# unique : 고유값 개수 top : 최빈값 freq : 빈도수
<bound method NDFrame.describe of mpg cylinders displacement horsepower weight acceleration \
0 18.0 8 307.0 130.0 3504.0 12.0
1 15.0 8 350.0 165.0 3693.0 11.5
2 18.0 8 318.0 150.0 3436.0 11.0
3 16.0 8 304.0 150.0 3433.0 12.0
4 17.0 8 302.0 140.0 3449.0 10.5
.. ... ... ... ... ... ...
393 27.0 4 140.0 86.00 2790.0 15.6
394 44.0 4 97.0 52.00 2130.0 24.6
395 32.0 4 135.0 84.00 2295.0 11.6
396 28.0 4 120.0 79.00 2625.0 18.6
397 31.0 4 119.0 82.00 2720.0 19.4
model year origin name
0 70 1 chevrolet chevelle malibu
1 70 1 buick skylark 320
2 70 1 plymouth satellite
3 70 1 amc rebel sst
4 70 1 ford torino
.. ... ... ...
393 82 1 ford mustang gl
394 82 2 vw pickup
395 82 1 dodge rampage
396 82 1 ford ranger
397 82 1 chevy s-10
[398 rows x 9 columns]>
mpg cylinders displacement horsepower weight \
count 398.000000 398.000000 398.000000 398 398.000000
unique NaN NaN NaN 94 NaN
top NaN NaN NaN 150.0 NaN
freq NaN NaN NaN 22 NaN
mean 23.514573 5.454774 193.425879 NaN 2970.424623
std 7.815984 1.701004 104.269838 NaN 846.841774
min 9.000000 3.000000 68.000000 NaN 1613.000000
25% 17.500000 4.000000 104.250000 NaN 2223.750000
50% 23.000000 4.000000 148.500000 NaN 2803.500000
75% 29.000000 8.000000 262.000000 NaN 3608.000000
max 46.600000 8.000000 455.000000 NaN 5140.000000
acceleration model year origin name
count 398.000000 398.000000 398.000000 398
unique NaN NaN NaN 305
top NaN NaN NaN ford pinto
freq NaN NaN NaN 6
mean 15.568090 76.010050 1.572864 NaN
std 2.757689 3.697627 0.802055 NaN
min 8.000000 70.000000 1.000000 NaN
25% 13.825000 73.000000 1.000000 NaN
50% 15.500000 76.000000 1.000000 NaN
75% 17.175000 79.000000 2.000000 NaN
max 24.800000 82.000000 3.000000 NaN
데이터 개수 확인¶
In [7]:
print(df.count())
print('\n')
print(type(df.count()))
mpg 398
cylinders 398
displacement 398
horsepower 398
weight 398
acceleration 398
model year 398
origin 398
name 398
dtype: int64
<class 'pandas.core.series.Series'>
In [8]:
unique_values = df['origin'].value_counts()
print(unique_values)
print('\n')
print(type(unique_values))
1 249
3 79
2 70
Name: origin, dtype: int64
<class 'pandas.core.series.Series'>
In [10]:
# 평균값
print(df.mean())
print('\n')
print(df['mpg'].mean())
print('\n')
print(df[['mpg','weight']].mean())
mpg 23.514573
cylinders 5.454774
displacement 193.425879
weight 2970.424623
acceleration 15.568090
model year 76.010050
origin 1.572864
dtype: float64
23.514572864321615
mpg 23.514573
weight 2970.424623
dtype: float64
C:\Users\rladl\AppData\Local\Temp/ipykernel_14012/2164482834.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
print(df.mean())
In [11]:
# 중간값
print(df.median())
print('\n')
print(df['mpg'].median())
mpg 23.0
cylinders 4.0
displacement 148.5
weight 2803.5
acceleration 15.5
model year 76.0
origin 1.0
dtype: float64
23.0
C:\Users\rladl\AppData\Local\Temp/ipykernel_14012/28750081.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
print(df.median())
In [12]:
# 최대값
print(df.max())
print('\n')
# 최소값
print(df.min())
mpg 46.6
cylinders 8
displacement 455.0
horsepower ?
weight 5140.0
acceleration 24.8
model year 82
origin 3
name vw rabbit custom
dtype: object
mpg 9.0
cylinders 3
displacement 68.0
horsepower 100.0
weight 1613.0
acceleration 8.0
model year 70
origin 1
name amc ambassador brougham
dtype: object
In [13]:
# 표준편차
print(df.std())
print('\n')
# 상관계수
print(df.corr())
mpg 7.815984
cylinders 1.701004
displacement 104.269838
weight 846.841774
acceleration 2.757689
model year 3.697627
origin 0.802055
dtype: float64
mpg cylinders displacement weight acceleration \
mpg 1.000000 -0.775396 -0.804203 -0.831741 0.420289
cylinders -0.775396 1.000000 0.950721 0.896017 -0.505419
displacement -0.804203 0.950721 1.000000 0.932824 -0.543684
weight -0.831741 0.896017 0.932824 1.000000 -0.417457
acceleration 0.420289 -0.505419 -0.543684 -0.417457 1.000000
model year 0.579267 -0.348746 -0.370164 -0.306564 0.288137
origin 0.563450 -0.562543 -0.609409 -0.581024 0.205873
model year origin
mpg 0.579267 0.563450
cylinders -0.348746 -0.562543
displacement -0.370164 -0.609409
weight -0.306564 -0.581024
acceleration 0.288137 0.205873
model year 1.000000 0.180662
origin 0.180662 1.000000
C:\Users\rladl\AppData\Local\Temp/ipykernel_14012/2183833806.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
print(df.std())
판다스 내장 그래프 도구 활용¶
In [2]:
df = pd.read_excel('C:\\Users\\rladl\\Jupyter.study\\05000266\part3\\남북한발전전력량.xlsx',engine='openpyxl')
df_ns = df.iloc[[0,5],3:] # 남한, 북한 발전량 합계 데이터만 추출
df_ns.index = ['South','North'] # 행 인덱스 변경
df_ns.columns = df_ns.columns.map(int) # 열 이름의 자료형을 정수형으로 변경
print(df_ns.head())
1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 ... 2007 \
South 1186 1310 1444 1650 1847 2055 2244 2153 2393 2664 ... 4031
North 263 247 221 231 230 213 193 170 186 194 ... 236
2008 2009 2010 2011 2012 2013 2014 2015 2016
South 4224 4336 4747 4969 5096 5171 5220 5281 5404
North 255 235 237 211 215 221 216 190 239
[2 rows x 26 columns]
In [17]:
tdf_ns = df_ns.T
print(tdf_ns.head())
print('\n')
tdf_ns.plot()
South North
1991 1186 263
1992 1310 247
1993 1444 221
1994 1650 231
1995 1847 230
Out[17]:
<AxesSubplot:>
In [19]:
print(tdf_ns.head())
print('\n')
tdf_ns.plot(kind='bar')
South North
1991 1186 263
1992 1310 247
1993 1444 221
1994 1650 231
1995 1847 230
Out[19]:
<AxesSubplot:>
In [20]:
tdf_ns.plot(kind='hist')
Out[20]:
<AxesSubplot:ylabel='Frequency'>
In [21]:
df = pd.read_csv('C:\\Users\\rladl\\Jupyter.study\\05000266\\part3\\auto-mpg.csv')
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','name']
df.plot(x='weight',y='mpg',kind='scatter')
Out[21]:
<AxesSubplot:xlabel='weight', ylabel='mpg'>
In [22]:
df[['mpg','cylinders']].plot(kind='box')
Out[22]:
<AxesSubplot:>
반응형
'Python > Pandas' 카테고리의 다른 글
[Pandas] dtype 기반 열 선택 / select_dtypes (0) | 2022.06.06 |
---|---|
[pandas] pivot_table 원하는 대로 테이블 만들기 (0) | 2022.05.18 |
[pandas] rename, reset_index (0) | 2022.05.17 |
[pandas] 레이블인코딩 / 원핫 인코딩 / 문자열을 범주형 데이터로 바꾸기 (0) | 2022.03.05 |
[Pandas] series / dataframe / 데이터구조 다루기 (0) | 2022.02.21 |