print (df) val 0 HF - Antartica 1 HF - America 2 HF - Asia print (df.val.replace({'HF -':'Hi'}, regex=True)) 0 Hi Antartica 1 Hi America 2 Hi Asia Name: val, dtype: object
1
2
3
4
5
6
7
8
9
10
11
|
print
(
df
)
val
0
HF
-
Antartica
1
HF
-
America
2
HF
-
Asia
print
(
df
.
val
.
replace
(
{
'HF -'
:
'Hi'
}
,
regex
=
True
)
)
0
Hi
Antartica
1
Hi
America
2
Hi
Asia
Name
:
val
,
dtype
:
object
|
>df = pd.DataFrame({'Column': ['HF - Antartica', 'HF - America', 'HF - Asia']}) >df.replace('HF', 'Hi Funny', regex=True) Column 0 Hi Funny - Antartica 1 Hi Funny - America 2 Hi Funny - Asia
1
2
3
4
5
6
7
|
>
df
=
pd
.
DataFrame
(
{
'Column'
:
[
'HF - Antartica'
,
'HF - America'
,
'HF - Asia'
]
}
)
>
df
.
replace
(
'HF'
,
'Hi Funny'
,
regex
=
True
)
Column
0
Hi
Funny
-
Antartica
1
Hi
Funny
-
America
2
Hi
Funny
-
Asia
|
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.replace.html
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html
字符串函数
# SELECT CONCAT(origin, ' to ', destination) df['origin'].str.cat(df['destination'], sep=' to ') df['origin'].str.strip() # TRIM(origin) df['origin'].str.len() # LENGTH(origin) df['origin'].str.replace('a', 'b') # REPLACE(origin, 'a', 'b') # SELECT SUBSTRING(origin, 1, 1) df['origin'].str[0:1] # 使用 Python 字符串索引 # SELECT SUBSTRING_INDEX(domain, '.', 2) # www.example.com -> www.example df['domain'].str.split('.').str[:2].str.join('.') df['domain'].str.extract(r'^([^.]+\.[^.]+)') s.str.split('')#.split('')切割字符串('这里写分隔符') s.str.replace('A','Z')#.replace()将字符串替换成另外的 s1 = pd.Series(['a1','a2','a3','a4'])#创建一个新的序列 s1.str.extract('([abc])(\d)')#提取 s1.str.extract('[ab](\d)')#提取 s2 = pd.Series(['a','B','c','d']) a_z = r'[a-z]'#正则表达式 s2.str.contains(a_z)#.contains()检测列表里面有没有包含指定的内容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# SELECT CONCAT(origin, ' to ', destination)
df
[
'origin'
]
.
str
.
cat
(
df
[
'destination'
]
,
sep
=
' to '
)
df
[
'origin'
]
.
str
.
strip
(
)
# TRIM(origin)
df
[
'origin'
]
.
str
.
len
(
)
# LENGTH(origin)
df
[
'origin'
]
.
str
.
replace
(
'a'
,
'b'
)
# REPLACE(origin, 'a', 'b')
# SELECT SUBSTRING(origin, 1, 1)
df
[
'origin'
]
.
str
[
0
:
1
]
# 使用 Python 字符串索引
# SELECT SUBSTRING_INDEX(domain, '.', 2)
# www.example.com -> www.example
df
[
'domain'
]
.
str
.
split
(
'.'
)
.
str
[
:
2
]
.
str
.
join
(
'.'
)
df
[
'domain'
]
.
str
.
extract
(
r
'^([^.]+\.[^.]+)'
)
s
.
str
.
split
(
''
)
#.split('')切割字符串('这里写分隔符')
s
.
str
.
replace
(
'A'
,
'Z'
)
#.replace()将字符串替换成另外的
s1
=
pd
.
Series
(
[
'a1'
,
'a2'
,
'a3'
,
'a4'
]
)
#创建一个新的序列
s1
.
str
.
extract
(
'([abc])(\d)'
)
#提取
s1
.
str
.
extract
(
'[ab](\d)'
)
#提取
s2
=
pd
.
Series
(
[
'a'
,
'B'
,
'c'
,
'd'
]
)
a_z
=
r
'[a-z]'
#正则表达式
s2
.
str
.
contains
(
a_z
)
#.contains()检测列表里面有没有包含指定的内容
|
日期函数
pd.to_datetime
用于将各种日期字符串转换成标准的 datetime64
类型。日期类型的 Series 都会有一个 dt
属性,从中可以获取到有关日期时间的信息,具体请参考文档 Time Series / Date functionality。
# SELECT STR_TO_DATE(full_date, '%Y%m%d%H%i%s') AS `datetime` df['datetime'] = pd.to_datetime(df['full_date'], format='%Y%m%d%H%M%S') # SELECT DATE_FORMAT(`datetime`, '%Y-%m-%d') df['datetime'].dt.strftime('%Y-%m-%d') df['datetime'].dt.month # MONTH(`datetime`) df['datetime'].dt.hour # HOUR(`datetime`) # SELECT UNIX_TIMESTAMP(`datetime`) df['datetime'].view('int64') // pd.Timedelta(1, unit='s').value # SELECT FROM_UNIXTIME(`timestamp`) pd.to_datetime(df['timestamp'], unit='s') # SELECT `datetime` + INTERVAL 1 DAY df['datetime'] + pd.Timedelta(1, unit='D')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
# SELECT STR_TO_DATE(full_date, '%Y%m%d%H%i%s') AS `datetime`
df
[
'datetime'
]
=
pd
.
to_datetime
(
df
[
'full_date'
]
,
format
=
'%Y%m%d%H%M%S'
)
# SELECT DATE_FORMAT(`datetime`, '%Y-%m-%d')
df
[
'datetime'
]
.
dt
.
strftime
(
'%Y-%m-%d'
)
df
[
'datetime'
]
.
dt
.
month
# MONTH(`datetime`)
df
[
'datetime'
]
.
dt
.
hour
# HOUR(`datetime`)
# SELECT UNIX_TIMESTAMP(`datetime`)
df
[
'datetime'
]
.
view
(
'int64'
)
/
/
pd
.
Timedelta
(
1
,
unit
=
's'
)
.
value
# SELECT FROM_UNIXTIME(`timestamp`)
pd
.
to_datetime
(
df
[
'timestamp'
]
,
unit
=
's'
)
# SELECT `datetime` + INTERVAL 1 DAY
df
[
'datetime'
]
+
pd
.
Timedelta
(
1
,
unit
=
'D'
)
|