Get value of a cell at position (row,column) with openpyxl
from openpyxl import load_workbook

# Open the workbook in read-only mode; `file_name` is the path to the .xlsx
# file and must be defined before this snippet runs.
wb = load_workbook(file_name, read_only=True)
test_sheet = wb["Test"]

# Worksheet.cell() addresses a cell by its 1-based row and column; keyword
# arguments keep the call unambiguous.
print(test_sheet.cell(row=1, column=1).value)

# A read-only workbook keeps its file handle open until closed explicitly.
wb.close()
Single condition filtering
# importing pandas package
import pandas as pd

# making data frame from csv file
data = pd.read_csv("employees.csv")

# replacing blank spaces with '_' so the column names can be referenced
# inside a query expression
data.columns = [column.replace(" ", "_") for column in data.columns]

# filtering with query method (modifies `data` in place)
data.query('Senior_Management == True', inplace=True)

# display data
print(data)
Multiple condition filtering
# importing pandas package
import pandas as pd

# making data frame from csv file
data = pd.read_csv("employees.csv")

# replacing blank spaces with '_' so the column names can be referenced
# inside a query expression
data.columns = [column.replace(" ", "_") for column in data.columns]

# filtering with query method: every condition joined by `and` must hold
# (modifies `data` in place)
data.query(
    'Senior_Management == True and Gender =="Male" '
    'and Team =="Marketing" and First_Name =="Johnny"',
    inplace=True,
)

# display data
print(data)
References
https://www.geeksforgeeks.org/python-filtering-data-with-pandas-query-method/
# Number of distinct values in the 'country' column
len(gapminder['country'].unique().tolist())

# Distinct values of the 'region' column as a set
set(df['region'].values.tolist())

# Create a list of unique values by turning the
# pandas column into a set
list(set(df.trucks))

# Create a list of unique values in df.trucks
list(df['trucks'].unique())
# Import pandas package
import pandas as pd

# create a dictionary with five fields each
data = {
    'A': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'B': ['B1', 'B2', 'B3', 'B4', 'B4'],
    'C': ['C1', 'C2', 'C3', 'C3', 'C3'],
    'D': ['D1', 'D2', 'D2', 'D2', 'D2'],
    'E': ['E1', 'E1', 'E1', 'E1', 'E1'],
}

# Convert the dictionary into DataFrame
df = pd.DataFrame(data)

# Get the unique values of 'B' column (in order of first appearance)
unique_b = df.B.unique()
print(unique_b)
# Import pandas package
import pandas as pd

# create a dictionary with five fields each
data = {
    'A': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'B': ['B1', 'B2', 'B3', 'B4', 'B4'],
    'C': ['C1', 'C2', 'C3', 'C3', 'C3'],
    'D': ['D1', 'D2', 'D2', 'D2', 'D2'],
    'E': ['E1', 'E1', 'E1', 'E1', 'E1'],
}

# Convert the dictionary into DataFrame
df = pd.DataFrame(data)

# Get number of unique values in column 'C' (missing values are not counted)
unique_count_c = df.C.nunique(dropna=True)
print(unique_count_c)
References
https://pythonprogramming.net/graph-visualization-python3-pandas-data-analysis/
https://www.geeksforgeeks.org/get-unique-values-from-a-column-in-pandas-dataframe/
https://chrisalbon.com/python/data_wrangling/pandas_find_unique_values/
https://cmdlinetips.com/2018/01/how-to-get-unique-values-from-a-column-in-pandas-data-frame/
DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
Sort Dataframe rows based on a single column
# Sort the rows of dataframe by column 'Name'
dfObj = dfObj.sort_values(by='Name')

print("Contents of Sorted Dataframe based on a single column 'Name' : ")
print(dfObj)
Sort Dataframe rows based on multiple columns
# Sort the rows by 'Name' first; ties are ordered by 'Marks'
dfObj = dfObj.sort_values(by=['Name', 'Marks'])

print("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ")
print(dfObj)
Sort Dataframe rows based on columns in Descending Order
# Sort the rows of dataframe by column 'Name' in descending order
dfObj = dfObj.sort_values(by='Name', ascending=False)

print("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ")
print(dfObj)
Sort Dataframe rows based on a column in place
# Sort the rows of dataframe by column 'Name' inplace
# (the call returns None and reorders dfObj itself)
dfObj.sort_values(by='Name', inplace=True)

print("Contents of Sorted Dataframe based on a single column 'Name' inplace: ")
print(dfObj)
Sort columns of a Dataframe based on a single row
# Sort columns of a dataframe based on a single row with index label 'b'
# (axis=1 makes `by` refer to a row label instead of a column name)
dfObj = dfObj.sort_values(by='b', axis=1)

print("Contents of Sorted Dataframe based on a single row index label 'b' ")
print(dfObj)
Sort columns of a Dataframe in Descending Order based on a single row
# Sort columns of a dataframe in descending order based on a single row
# with index label 'b'
dfObj = dfObj.sort_values(by='b', axis=1, ascending=False)

print("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ")
print(dfObj)
Sort columns of a Dataframe based on multiple rows
# Sort columns of a dataframe based on multiple rows with index labels
# 'b' & 'c' (ties in row 'b' are ordered by row 'c')
dfObj = dfObj.sort_values(by=['b', 'c'], axis=1)

print("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ")
print(dfObj)
Complete example is as follows:
import pandas as pd


def main():
    """Demonstrate DataFrame.sort_values() on rows and on columns.

    Builds two small sample frames and prints each one before and after
    sorting: rows by one or several columns (ascending, descending and in
    place), then columns by one or several rows (axis=1).
    """
    # List of Tuples
    students = [
        ('Jack', 34, 'Sydney'),
        ('Riti', 41, 'Delhi'),
        ('Aadi', 16, 'New York'),
        ('Riti', 22, 'Delhi'),
        ('Riti', 35, 'Delhi'),
        ('Riti', 40, 'Mumbai'),
    ]

    # Create a DataFrame object
    dfObj = pd.DataFrame(students,
                         columns=['Name', 'Marks', 'City'],
                         index=['b', 'a', 'f', 'e', 'd', 'c'])

    print("Original Dataframe : ")
    print(dfObj)

    print('**** Sort Dataframe rows based on a single column ****')

    # Sort the rows of dataframe by column 'Name'
    dfObj = dfObj.sort_values(by='Name')

    print("Contents of Sorted Dataframe based on a single column 'Name' : ")
    print(dfObj)

    print('**** Sort Dataframe rows based on a multiple columns ****')

    dfObj = dfObj.sort_values(by=['Name', 'Marks'])

    print("Contents of a Sorted Dataframe based on multiple columns 'Name' & 'Marks' : ")
    print(dfObj)

    print('**** Sort Dataframe rows based on a single column in Descending Order ****')

    # Sort the rows of dataframe by column 'Name' in descending order
    dfObj = dfObj.sort_values(by='Name', ascending=False)

    print("Contents of Sorted Dataframe based on a column 'Name' in Descending Order : ")
    print(dfObj)

    print('**** Sort Dataframe rows based on a single column in place ****')

    # Sort the rows of dataframe by column 'Name' inplace
    dfObj.sort_values(by='Name', inplace=True)

    print("Contents of Sorted Dataframe based on a single column 'Name' inplace: ")
    print(dfObj)

    print('******** Sort columns of Dataframe based on a single or multiple rows ********')

    # List of Tuples
    matrix = [
        (222, 16, 23),
        (333, 31, 11),
        (444, 34, 11),
    ]

    # Create a DataFrame object of 3X3 Matrix
    dfObj = pd.DataFrame(matrix, index=list('abc'))

    print("Original Dataframe: ")
    print(dfObj)

    # Sort columns of a dataframe based on a single row with index label 'b'
    dfObj = dfObj.sort_values(by='b', axis=1)

    print("Contents of Sorted Dataframe based on a single row index label 'b' ")
    print(dfObj)

    # Sort columns of a dataframe in descending order based on a single row
    # with index label 'b'
    dfObj = dfObj.sort_values(by='b', axis=1, ascending=False)

    print("Contents of Sorted Dataframe in descending order based on a single row index label 'b' ")
    print(dfObj)

    # Sort columns of a dataframe based on a multiple row with index labels
    # 'b' & 'c'
    dfObj = dfObj.sort_values(by=['b', 'c'], axis=1)

    print("Contents of Sorted Dataframe based on multiple rows index label 'b' & 'c' ")
    print(dfObj)


if __name__ == '__main__':
    main()
References
https://thispointer.com/pandas-sort-rows-or-columns-in-dataframe-based-on-values-using-dataframe-sort_values/
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Parameters:
      filename : path of the Excel file
                 (Example: '/path/to/file.xlsx')
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) the data is written below the
                 last used row of the existing sheet, or at row 0 when the
                 sheet is new or is being truncated.
      truncate_sheet : truncate (remove and recreate)
                       [sheet_name] before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to
                        `DataFrame.to_excel()` [can be dictionary]

    Returns: None

    Note: appending relies on `pd.ExcelWriter(mode='a', if_sheet_exists=...)`
    with the 'overlay' option, so a pandas version that supports it is
    required.
    """
    import os

    from openpyxl import load_workbook
    import pandas as pd

    # ignore [engine] parameter if it was passed: the writer below is always
    # created with the openpyxl engine
    to_excel_kwargs.pop('engine', None)

    # Decide this BEFORE creating the writer: a writer opened in write mode
    # would overwrite the existing workbook.
    file_exists = os.path.isfile(filename)

    # get the last row in the existing Excel sheet
    # if it was not specified explicitly; a sheet that is about to be
    # truncated has no rows worth skipping, so it is not inspected
    if file_exists and startrow is None and not truncate_sheet:
        book = load_workbook(filename)
        try:
            if sheet_name in book.sheetnames:
                startrow = book[sheet_name].max_row
        finally:
            book.close()

    if startrow is None:
        startrow = 0

    if file_exists:
        # 'replace' drops and recreates the sheet (truncate), 'overlay'
        # writes into the existing sheet; other sheets are kept either way
        writer_options = {
            'mode': 'a',
            'if_sheet_exists': 'replace' if truncate_sheet else 'overlay',
        }
    else:
        # file does not exist yet, we will create it
        writer_options = {'mode': 'w'}

    # the context manager saves and closes the workbook, even on error
    with pd.ExcelWriter(filename, engine='openpyxl', **writer_options) as writer:
        # write out the new sheet
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    **to_excel_kwargs)
Selecting pandas data using “iloc”
The iloc indexer for Pandas Dataframe is used for integer-location based indexing / selection by position.
# Single selections using iloc and DataFrame
# Rows:
data.iloc[0]   # first row of data frame (Aleshia Tomkiewicz) - Note a Series data type output.
data.iloc[1]   # second row of data frame (Evan Zigomalas)
data.iloc[-1]  # last row of data frame (Mi Richan)
# Columns:
data.iloc[:, 0]   # first column of data frame (first_name)
data.iloc[:, 1]   # second column of data frame (last_name)
data.iloc[:, -1]  # last column of data frame (id)
# Multiple row and column selections using iloc and DataFrame
data.iloc[0:5]                       # first five rows of dataframe
data.iloc[:, 0:2]                    # first two columns of data frame with all rows
data.iloc[[0, 3, 6, 24], [0, 5, 6]]  # 1st, 4th, 7th, 25th row + 1st 6th 7th columns.
# The end of an iloc slice is exclusive, so 5:8 selects positions 5, 6 and 7.
data.iloc[0:5, 5:8]                  # first 5 rows and 6th, 7th, 8th columns of data frame (county -> phone1).
Selecting pandas data using “loc”
The Pandas loc indexer can be used with DataFrames for two different use cases:
a.) Selecting rows by label/index
b.) Selecting rows with a boolean / conditional lookup
# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
data.loc[['Andrade', 'Veness'], 'city':'email']

# Select every row from index label 'Andrade' through 'Veness' (a label
# slice includes both ends), with just 'first_name', 'address' and 'city' columns
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]

# Change the index to be based on the 'id' column
data.set_index('id', inplace=True)

# select the row with 'id' = 487
data.loc[487]
# Select rows with first name Antonio,
# and all columns between 'city' and 'email'
data.loc[data['first_name'] == 'Antonio', 'city':'email']

# Select rows where the email column ends with 'hotmail.com', include all columns
data.loc[data['email'].str.endswith("hotmail.com")]

# Select rows with first_name equal to some values, all columns
data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])]

# Select rows with first name Antonio AND gmail email addresses
data.loc[data['email'].str.endswith("gmail.com") & (data['first_name'] == 'Antonio')]

# select rows with id column between 100 and 200, and just return 'postal' and 'web' columns
data.loc[(data['id'] > 100) & (data['id'] <= 200), ['postal', 'web']]

# A lambda function that yields True/False values can also be used.
# Select rows where the company name has 4 words in it.
data.loc[data['company_name'].apply(lambda x: len(x.split(' ')) == 4)]

# Selections can be achieved outside of the main .loc for clarity:
# Form a separate variable with your selections:
idx = data['company_name'].apply(lambda x: len(x.split(' ')) == 4)
# Select only the True values in 'idx' and only the 3 columns specified:
data.loc[idx, ['email', 'first_name', 'company_name']]
References
https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/
https://thispointer.com/select-rows-columns-by-name-or-index-in-dataframe-using-loc-iloc-python-pandas/
Method #1: Using DataFrame.astype()
# importing pandas as pd
import pandas as pd

# sample dataframe
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '1.0', '1.3', 2, 5],
})

# converting all columns to string type
df = df.astype(str)
print(df.dtypes)
# importing pandas as pd
import pandas as pd

# sample dataframe
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '1.0', '1.3', 2, 5],
})

# using dictionary to convert specific columns;
# columns not listed (here 'B') keep their dtype
convert_dict = {
    'A': int,
    'C': float,
}

df = df.astype(convert_dict)
print(df.dtypes)
Method #2: Using DataFrame.apply()
We can pass pandas.to_numeric, pandas.to_datetime and pandas.to_timedelta as arguments to the apply() function to change the datatype of one or more columns to numeric, datetime and timedelta respectively.
# importing pandas as pd
import pandas as pd

# sample dataframe
df = pd.DataFrame({
    'A': [1, 2, 3, '4', '5'],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '2.1', 3.0, '4.1', '5.1'],
})

# using apply method: pd.to_numeric is called once per selected column
df[['A', 'C']] = df[['A', 'C']].apply(pd.to_numeric)
print(df.dtypes)
Method #3: Using DataFrame.infer_objects()
# importing pandas as pd
import pandas as pd

# sample dataframe; dtype='object' forces every column to the generic
# object dtype so there is something to infer
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, 2.1, 3.0, 4.1, 5.1],
}, dtype='object')

# converting datatypes
df = df.infer_objects()
print(df.dtypes)
References
https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/
https://towardsdatascience.com/my-pandas-cheat-sheet-b71437ab26f
import pandas as pd

df: pd.DataFrame = pd.read_csv("avocado.csv")

# Work on a copy so that later changes do not touch the frame read from disk.
df_cp = df.copy()
References
https://pythonprogramming.net/introduction-python3-pandas-data-analysis/
import pandas as pd

df: pd.DataFrame = pd.read_csv("avocado.csv")

# Use the "Date" column as the index, then order the rows by that index.
df.set_index("Date", inplace=True)
df.sort_index(inplace=True)

print(df.index)
References
https://pythonprogramming.net/graph-visualization-python3-pandas-data-analysis/
import pandas as pd

df: pd.DataFrame = pd.read_csv("avocado.csv")

# index of dataframe
print(df.index)

# set index in dataframe
df.set_index("Date", inplace=True)
# or set index this way
# df = df.set_index("Date")

# print index
print(df.index)
References
https://pythonprogramming.net/introduction-python3-pandas-data-analysis/