3.10. String functions on Columns#

3.10.1. EDA for columns#

import pandas as pd
import numpy as np
# Small demo frame referenced by the examples below
df = pd.DataFrame(
    {
        "First Name": ["Sahil", "Sonia", "Sourav", "Vishal"],
        "Age": [10, 20, 30, 40],
        "Gender": ["M", "F", "M", "M"],
        "City": ["J", "K", "L", "P"],
        "Place of Work": [True, False, False, True],
    }
)
df
First Name Age Gender City Place of Work
0 Sahil 10 M J True
1 Sonia 20 F K False
2 Sourav 30 M L False
3 Vishal 40 M P True

3.10.1.1. Get columns as list#

# Column labels as a plain Python list
df.columns.tolist()
['First Name', 'Age', 'Gender', 'City', 'Place of Work']

3.10.1.2. Convert column names to Series | DataFrame#

# Series whose index and values are both the column labels
df.columns.to_series()
First Name          First Name
Age                        Age
Gender                  Gender
City                      City
Place of Work    Place of Work
dtype: object
# One-column DataFrame (column named 0) indexed by the column labels
df.columns.to_frame()
0
First Name First Name
Age Age
Gender Gender
City City
Place of Work Place of Work

3.10.1.3. Check if specific column is there or not#

# Element-wise substring test: True where a label contains 'Name'.
# For an exact-name membership check use: 'First Name' in df.columns
df.columns.str.contains('Name') 
array([ True, False, False, False, False])

3.10.1.4. Check if any duplicate column is there#

# True marks a label that repeats an earlier one; all False means every name is unique
df.columns.duplicated()
array([False, False, False, False, False])

3.10.1.5. Check methods/attributes of String#

# First five names exposed by the .str accessor (these happen to be dunder attributes)
dir(df.columns.str)[0:5]
['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__']

3.10.1.6. Make column names to lower case#

# Returns a new Index; df.columns itself stays unchanged unless the result is assigned back
df.columns.str.lower()
Index(['first name', 'age', 'gender', 'city', 'place of work'], dtype='object')

3.10.1.7. Make column names to Upper case#

# Returns a new Index; df.columns itself stays unchanged unless the result is assigned back
df.columns.str.upper()
Index(['FIRST NAME', 'AGE', 'GENDER', 'CITY', 'PLACE OF WORK'], dtype='object')

3.10.1.8. Make column names to Title case#

df.columns.str.title()  # Title Case: first letter of every word upper-cased ('of' -> 'Of')
Index(['First Name', 'Age', 'Gender', 'City', 'Place Of Work'], dtype='object')

3.10.1.9. Make column names to Capitalize#

df.columns.str.capitalize() # Only the first character is upper-cased; the rest are lower-cased
Index(['First name', 'Age', 'Gender', 'City', 'Place of work'], dtype='object')

3.10.1.10. Replace empty spaces with underscores#

# Replace every space with an underscore (returns a new Index; df is not modified)
df.columns.str.replace(' ', '_')
Index(['First_Name', 'Age', 'Gender', 'City', 'Place_of_Work'], dtype='object')

3.10.1.11. Rename columns#

# Template call: 'oldname' / 'newname' are placeholders. df has no 'oldname'
# column, so this changes nothing here (rename ignores missing labels by default).
df.rename(columns={'oldname':'newname'},inplace=True)

3.10.1.12. Check total number of columns#

# Number of columns (same value as df.shape[1])
len(df.columns)
5

3.10.1.13. Select particular columns#

# .values exposes the labels as a NumPy array; the slice yields the first four labels
df.columns.values[0:4]
array(['First Name', 'Age', 'Gender', 'City'], dtype=object)

3.10.1.14. Get 3rd column (index 2) and rename it#

# Rename the column at position 2 through the public API. Writing into
# df.columns.values mutates the Index's backing array in place, which bypasses
# the immutability pandas expects of an Index.
df = df.rename(columns={df.columns[2]: 'DOB'})
df
First Name Age DOB City Place of Work
0 Sahil 10 M J True
1 Sonia 20 F K False
2 Sourav 30 M L False
3 Vishal 40 M P True

3.10.1.15. Select all columns except one#

# Boolean-mask the Index: keeps every label except 'DOB' (returns labels only, not data)
df.columns[df.columns!= 'DOB']
Index(['First Name', 'Age', 'City', 'Place of Work'], dtype='object')
# Filtering the labels does not modify df; all five columns are still present
df
First Name Age DOB City Place of Work
0 Sahil 10 M J True
1 Sonia 20 F K False
2 Sourav 30 M L False
3 Vishal 40 M P True

3.10.1.16. Select all columns except multiple#

#?
df.loc[:,-df.columns.isin(['DOB','City']).columns]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Input In [21], in <cell line: 2>()
      1 #?
----> 2 df.loc[:,-df.columns.isin(['DOB','City']).columns]

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

3.10.1.17. Select column names that begin with a particular word#

df.columns.str.startswith('First')
# Gives a boolean array: True where the label starts with 'First'
array([ True, False, False, False, False])

3.10.1.18. Select group of column names#

# Pick labels by a list of positions from the underlying NumPy array
df.columns.values[[0,1,2]]
array(['First Name', 'Age', 'DOB'], dtype=object)
# Slice the Index directly; the result is still an Index
df.columns[0:3]
Index(['First Name', 'Age', 'DOB'], dtype='object')