DataFrame.groupby in Python Pandas

# Demonstrate DataFrame.groupby() on a single column.
import pandas as pd 

# Load the NBA data set into a DataFrame.
df = pd.read_csv("nba.csv") 

# Show the raw data.
df 
# Partition the rows into one group per
# distinct value of the "Team" column.
gk = df.groupby(by='Team') 

# Show the first row of every group
# that was formed above.
gk.first() 
# Show every row belonging to the "Boston Celtics" group.
gk.get_group('Boston Celtics') 
# importing pandas as pd
import pandas as pd 

# Creating the dataframe 
df = pd.read_csv("nba.csv") 

# First grouping based on "Team";
# within each team we are grouping based on "Position".
# BUG FIX: the original called `daf.groupby(...)` — `daf` is undefined;
# the DataFrame created above is named `df`.
gkk = df.groupby(['Team', 'Position']) 

# Print the first value in each group 
gkk.first() 

References
https://www.geeksforgeeks.org/python-pandas-dataframe-groupby/

Querying data with Pandas .query() method

Single condition filtering

# Demonstrate single-condition filtering with DataFrame.query().
import pandas as pd 

# Load the employee records from disk.
data = pd.read_csv("employees.csv") 

# query() needs column names that are valid identifiers,
# so turn any blank spaces in the names into underscores first.
data.columns = [col.replace(" ", "_") for col in data.columns]

# Keep only the senior-management rows, mutating `data` itself.
data.query('Senior_Management == True', inplace=True)

# display 
data 

Multiple condition filtering

# importing pandas package 
import pandas as pd 

# making data frame from csv file 
data = pd.read_csv("employees.csv") 

# replacing blank spaces with '_' 
data.columns = [column.replace(" ", "_") for column in data.columns]

# filtering with query method
# BUG FIX: the original spread one single-quoted string literal over
# three physical lines, which is a SyntaxError in Python. Adjacent
# string literals inside the call's parentheses are concatenated
# implicitly, keeping the query readable AND syntactically valid.
data.query(
    'Senior_Management == True '
    'and Gender == "Male" and Team == "Marketing" '
    'and First_Name == "Johnny"',
    inplace=True,
)

# display 
data 

References
https://www.geeksforgeeks.org/python-filtering-data-with-pandas-query-method/

Get unique values from a column in Pandas DataFrame

# Count the distinct countries (Series.unique -> ndarray -> list -> len).
# NOTE(review): `gapminder` and `df` are assumed to be DataFrames loaded
# elsewhere — they are not defined in this snippet.
len(gapminder['country'].unique().tolist())
# Distinct regions, via the underlying numpy values turned into a set.
set(df['region'].values.tolist())
# Create a list of unique values by turning the
# pandas column into a set
list(set(df.trucks))
# Create a list of unique values in df.trucks
# (unlike the set() versions, unique() keeps first-appearance order).
list(df['trucks'].unique())
# Import pandas package
import pandas as pd 

# Five short columns; column 'B' holds four distinct values.
data = {
    'A': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'B': ['B1', 'B2', 'B3', 'B4', 'B4'],
    'C': ['C1', 'C2', 'C3', 'C3', 'C3'],
    'D': ['D1', 'D2', 'D2', 'D2', 'D2'],
    'E': ['E1', 'E1', 'E1', 'E1', 'E1'],
}

# Build the DataFrame from the dictionary.
df = pd.DataFrame(data)

# The distinct values of column 'B', in order of first appearance.
df['B'].unique()
# Import pandas package
import pandas as pd 

# Five short columns; column 'C' holds three distinct values.
data = {
    'A': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'B': ['B1', 'B2', 'B3', 'B4', 'B4'],
    'C': ['C1', 'C2', 'C3', 'C3', 'C3'],
    'D': ['D1', 'D2', 'D2', 'D2', 'D2'],
    'E': ['E1', 'E1', 'E1', 'E1', 'E1'],
}

# Build the DataFrame from the dictionary.
df = pd.DataFrame(data)

# Count the distinct values in column 'C' (NaN excluded).
df['C'].nunique(dropna=True)

References
https://pythonprogramming.net/graph-visualization-python3-pandas-data-analysis/
https://www.geeksforgeeks.org/get-unique-values-from-a-column-in-pandas-dataframe/
https://chrisalbon.com/python/data_wrangling/pandas_find_unique_values/
https://cmdlinetips.com/2018/01/how-to-get-unique-values-from-a-column-in-pandas-data-frame/

Sort rows or columns in Dataframe based on values in Pandas

DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

Sort Dataframe rows based on a single column

# Order the rows alphabetically by the values in column 'Name'.
dfObj = dfObj.sort_values('Name')
print("Contents of Sorted Dataframe based on a single column 'Name' : ")
print(dfObj)

Sort Dataframe rows based on multiple columns

# Order rows by 'Name' first, breaking ties with 'Marks'.
dfObj = dfObj.sort_values(['Name', 'Marks'])
print("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ")
print(dfObj)

Sort Dataframe rows based on columns in Descending Order

# Reverse-alphabetical ordering of the rows by column 'Name'.
dfObj = dfObj.sort_values('Name', ascending=False)
print("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ")
print(dfObj)

Sort Dataframe rows based on a column in place

# In-place variant: mutate dfObj itself instead of assigning a copy.
dfObj.sort_values('Name', inplace=True)
print("Contents of Sorted Dataframe based on a single column 'Name' inplace: ")
print(dfObj)

Sort columns of a Dataframe based on a single row

# axis=1 sorts the COLUMNS, ordered by the values found in row 'b'.
dfObj = dfObj.sort_values('b', axis=1)
print("Contents of Sorted Dataframe based on a single row index label 'b' ")
print(dfObj)

Sort columns of a Dataframe in Descending Order based on a single row

# Sort the columns by the values in row 'b', largest first.
# BUG FIX: this fragment was a copy of the ascending example and omitted
# ascending=False even though the heading promises descending order
# (compare the complete example below, which includes it).
dfObj = dfObj.sort_values(by='b', axis=1, ascending=False)
print("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ")
print(dfObj)

Sort columns of a Dataframe based on multiple rows

# Sort the columns by row 'b', breaking ties with row 'c'.
# BUG FIX: this fragment was a copy of the single-row example and sorted
# only by 'b' even though the heading promises multiple rows (compare
# the complete example below, which sorts by ['b', 'c']).
dfObj = dfObj.sort_values(by=['b', 'c'], axis=1)
print("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ")
print(dfObj)

Complete example is as follows:

import pandas as pd


def main():
    """Demonstrate DataFrame.sort_values on rows and on columns.

    BUG FIX: the original pasted the whole body of main() (and the
    __main__ guard's call) at column 0, which is a SyntaxError; the
    logic is unchanged, only re-indented.
    """
    # List of (name, marks, city) tuples.
    students = [('Jack', 34, 'Sydney'),
                ('Riti', 41, 'Delhi'),
                ('Aadi', 16, 'New York'),
                ('Riti', 22, 'Delhi'),
                ('Riti', 35, 'Delhi'),
                ('Riti', 40, 'Mumbai')]
    # Create a DataFrame object with a deliberately unsorted index.
    dfObj = pd.DataFrame(students, columns=['Name', 'Marks', 'City'],
                         index=['b', 'a', 'f', 'e', 'd', 'c'])
    print("Original Dataframe : ")
    print(dfObj)
    print('**** Sort Dataframe rows based on a single column ****')
    # Sort the rows of dataframe by column 'Name'
    dfObj = dfObj.sort_values(by='Name')
    print("Contents of Sorted Dataframe based on a single column 'Name' : ")
    print(dfObj)
    print('**** Sort Dataframe rows based on a multiple columns ****')
    # 'Name' is the primary key, 'Marks' breaks ties.
    dfObj = dfObj.sort_values(by=['Name', 'Marks'])
    print("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ")
    print(dfObj)
    print('**** Sort Dataframe rows based on a single column in Descending Order ****')
    # Sort the rows of dataframe by column 'Name' in descending order
    dfObj = dfObj.sort_values(by='Name', ascending=False)
    print("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ")
    print(dfObj)
    print('**** Sort Dataframe rows based on a single column in place ****')
    # Sort the rows of dataframe by column 'Name' inplace
    dfObj.sort_values(by='Name', inplace=True)
    print("Contents of Sorted Dataframe based on a single column 'Name' inplace: ")
    print(dfObj)
    print('******** Sort columns of Dataframe based on a single or multiple rows ********')
    # List of Tuples
    matrix = [(222, 16, 23),
              (333, 31, 11),
              (444, 34, 11)]
    # Create a DataFrame object of 3X3 Matrix
    dfObj = pd.DataFrame(matrix, index=list('abc'))
    print("Original Dataframe: ")
    print(dfObj)
    # Sort columns of a dataframe based on a single row with index label 'b'
    dfObj = dfObj.sort_values(by='b', axis=1)
    print("Contents of Sorted Dataframe based on a single row index label 'b' ")
    print(dfObj)
    # Sort columns in descending order based on the row with index label 'b'
    dfObj = dfObj.sort_values(by='b', axis=1, ascending=False)
    print("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ")
    print(dfObj)
    # Sort columns based on multiple rows with index labels 'b' & 'c'
    dfObj = dfObj.sort_values(by=['b', 'c'], axis=1)
    print("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ")
    print(dfObj)


if __name__ == '__main__':
    main()

References
https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html

Append existing excel sheet with new Dataframe using Python Pandas

def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, 
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : File path or existing ExcelWriter
                 (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) calculate the last row
                 in the existing DF and write to the next row...
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        [can be dictionary]

    Returns: None

    NOTE(review): this snippet leans on pandas/openpyxl internals that
    changed over time — assigning `writer.book` presumably fails on
    pandas >= 1.5 (the attribute became read-only) and
    `ExcelWriter.save()` was removed in pandas 2.0 in favour of
    `close()`. Confirm the pandas version in use before relying on it.
    """
    # Local imports: openpyxl is a third-party dependency, kept
    # function-scoped so importing this module does not require it.
    from openpyxl import load_workbook

    import pandas as pd

    # ignore [engine] parameter if it was passed
    # (the writer created below is hard-wired to the openpyxl engine)
    if 'engine' in to_excel_kwargs:
        to_excel_kwargs.pop('engine')

    writer = pd.ExcelWriter(filename, engine='openpyxl')

    # Python 2.x: define [FileNotFoundError] exception if it doesn't exist 
    try:
        FileNotFoundError
    except NameError:
        FileNotFoundError = IOError


    try:
        # try to open an existing workbook; replacing the writer's
        # freshly created empty book preserves the existing sheets
        writer.book = load_workbook(filename)

        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row

        # truncate sheet
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)

        # copy existing sheets so to_excel() writes into them
        # instead of creating renamed duplicates
        writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass

    # brand-new file or brand-new sheet: start writing at the top
    if startrow is None:
        startrow = 0

    # write out the new sheet
    # NOTE(review): sheet_name is passed positionally; newer pandas
    # prefers the keyword form `sheet_name=sheet_name` — confirm.
    df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)

    # save the workbook
    writer.save()

References
https://stackoverflow.com/questions/38074678/append-existing-excel-sheet-with-new-dataframe-using-python-pandas

Using iloc & loc to Select Rows and Columns in Pandas DataFrames

Selecting pandas data using “iloc”

The iloc indexer for Pandas Dataframe is used for integer-location based indexing / selection by position.

# Single selections using iloc and DataFrame
# NOTE(review): `data` is assumed to be a DataFrame loaded earlier
# (a contacts CSV, judging by the column names) — not defined here.
# Rows:
data.iloc[0] # first row of data frame (Aleshia Tomkiewicz) - Note a Series data type output.
data.iloc[1] # second row of data frame (Evan Zigomalas)
data.iloc[-1] # last row of data frame (Mi Richan)
# Columns:
data.iloc[:,0] # first column of data frame (first_name)
data.iloc[:,1] # second column of data frame (last_name)
data.iloc[:,-1] # last column of data frame (id)
# Multiple row and column selections using iloc and DataFrame
data.iloc[0:5] # first five rows of dataframe (end of slice is exclusive)
data.iloc[:, 0:2] # first two columns of data frame with all rows
data.iloc[[0,3,6,24], [0,5,6]] # 1st, 4th, 7th, 25th row + 1st 6th 7th columns.
data.iloc[0:5, 5:8] # first 5 rows and three columns at positions 5-7 (county -> phone1).

Selecting pandas data using “loc”

The Pandas loc indexer can be used with DataFrames for two different use cases:

a.) Selecting rows by label/index
b.) Selecting rows with a boolean / conditional lookup

# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
# NOTE(review): `data` is assumed to be a DataFrame loaded earlier — not defined here.
# (loc label slices, unlike iloc, INCLUDE both endpoints.)
data.loc[['Andrade', 'Veness'], 'city':'email']
# Select same rows, with just 'first_name', 'address' and 'city' columns
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]

# Change the index to be based on the 'id' column
data.set_index('id', inplace=True)
# select the row with 'id' = 487
data.loc[487]
# Select rows with first name Antonio,
# and all columns between 'city' and 'email'
data.loc[data['first_name'] == 'Antonio', 'city':'email']

# Select rows where the email column ends with 'hotmail.com', include all columns
data.loc[data['email'].str.endswith("hotmail.com")]

# Select rows with first_name equal to some values, all columns
data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])]

# Select rows with first name Antonio AND gmail email addresses
data.loc[data['email'].str.endswith("gmail.com") & (data['first_name'] == 'Antonio')]

# select rows with id column between 100 and 200, and just return 'postal' and 'web' columns
data.loc[(data['id'] > 100) & (data['id'] <= 200), ['postal', 'web']]

# A lambda function that yields True/False values can also be used.
# Select rows where the company name has 4 words in it.
data.loc[data['company_name'].apply(lambda x: len(x.split(' ')) == 4)]

# Selections can be achieved outside of the main .loc for clarity:
# Form a separate variable with your selections:
idx = data['company_name'].apply(lambda x: len(x.split(' ')) == 4)
# Select only the True values in 'idx' and only the 3 columns specified:
data.loc[idx, ['email', 'first_name', 'company']]

References
https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/

Change Data Type of columns in Pandas Dataframe

Method #1: Using DataFrame.astype()

# importing pandas as pd
import pandas as pd 

# sample dataframe; column 'C' deliberately mixes floats, ints and strings
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '1.0', '1.3', 2, 5],
})

# astype(str) casts every column to its string representation
df = df.astype(str)
print(df.dtypes)
# importing pandas as pd
import pandas as pd 

# sample dataframe; column 'C' deliberately mixes floats, ints and strings
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '1.0', '1.3', 2, 5],
})

# map column name -> target dtype for just the columns to convert
convert_dict = dict(A=int, C=float)

# astype() with a dict leaves the unlisted column 'B' untouched
df = df.astype(convert_dict)
print(df.dtypes)

Method #2: Using DataFrame.apply()

We can pass pandas.to_numeric, pandas.to_datetime and pandas.to_timedelta as argument to apply() function to change the datatype of one or more columns to numeric, datetime and timedelta respectively.

# importing pandas as pd
import pandas as pd 

# sample dataframe: 'A' and 'C' mix real numbers with numeric strings
df = pd.DataFrame({
    'A': [1, 2, 3, '4', '5'],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '2.1', 3.0, '4.1', '5.1'],
})

# apply() runs pd.to_numeric once per selected column
numeric_cols = ['A', 'C']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
print(df.dtypes)

Method #3: Using DataFrame.infer_objects()

# importing pandas as pd
import pandas as pd 

# sample dataframe: dtype='object' forces every column to start as object
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.1, 3.0, 4.1, 5.1],
}, dtype='object')

# infer_objects() downcasts each object column to its natural dtype
df = df.infer_objects()
print(df.dtypes)

References
https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/
https://towardsdatascience.com/my-pandas-cheat-sheet-b71437ab26f