Session 45 - Inner Join in PySpark - Joining over one Column
Problem Statement - Get the department name for every employee who is
assigned to a department. Employees who are not assigned to any of the
departments should not be returned in the results.
Sample Data -
emp_data = [
(1,"Person1",1),
(2,"Person2",2),
(3,"Person3",1),
(4,"Person4",1),
(5,"Person5",6),
(6,"Person6",4),
(7,"Person6",2),
(8,"Person8",3)
]
department_data = [
(1,"IT"),
(2,"HR"),
(3,"DE"),
(4,"BE"),
(5,"FE")
]
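With this sample data, Person5 points to dept_id 6, which has no match in department_data, so an inner join should drop that row and return the remaining seven employees along with their department names.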
Generic Structure for a Join between df1 and df2 -
joined_df = df1.join(other=df2,on=[df1.emp_id==df2.emp_id],how="inner")
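Here other is the DataFrame to join with, on is the join condition (a Column expression, a list of expressions, or simply a column name when both sides share it), and how is the join type - it defaults to "inner" when omitted.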
from pyspark.sql.types import *
from pyspark.sql import functions as F
# Define explicit schemas so the sample tuples get proper column names and types
emp_schema = StructType([
    StructField("emp_id",IntegerType()),
    StructField("emp_name",StringType()),
    StructField("dept_id",IntegerType())
])
department_schema = StructType([
    StructField("department_id",IntegerType()),
    StructField("department_name",StringType())
])
# Build the DataFrames from the sample data (spark is the active SparkSession)
emp_df = spark.createDataFrame(data=emp_data,schema=emp_schema)
dep_df = spark.createDataFrame(data=department_data,schema=department_schema)
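Before joining, it can help to sanity-check both inputs (an optional step, not part of the original walkthrough):
# Optional check - confirm the schemas and sample rows look right
emp_df.printSchema()
dep_df.printSchema()
emp_df.display()
dep_df.display()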
# Variant 1 - keyword arguments spelled out explicitly
joined_df = emp_df.join(other=dep_df,on=[emp_df.dept_id==dep_df.department_id],how="inner")\
    .select(F.col("emp_id"),F.col("emp_name"),F.col("department_id"),F.col("department_name"))
joined_df.display()
# Variant 2 - the same join with positional arguments
joined_df1 = emp_df.join(dep_df,emp_df.dept_id==dep_df.department_id,"inner")\
    .select(F.col("emp_id"),F.col("emp_name"),F.col("department_id"),F.col("department_name"))
joined_df1.display()
# Variant 3 - "inner" omitted, since it is the default join type
joined_df2 = emp_df.join(dep_df,emp_df.dept_id==dep_df.department_id)\
    .select(F.col("emp_id"),F.col("emp_name"),F.col("department_id"),F.col("department_name"))
joined_df2.display()
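All three variants return the same rows. One more sketch (not from the original session): if the department key is renamed so both sides share the same column name, the join can be expressed over that single name, which also avoids carrying a duplicate key column in the output.
# Hypothetical alternative - rename department_id to dept_id and join on the shared name
dep_renamed_df = dep_df.withColumnRenamed("department_id","dept_id")
joined_df3 = emp_df.join(dep_renamed_df,on="dept_id",how="inner")\
    .select(F.col("emp_id"),F.col("emp_name"),F.col("dept_id"),F.col("department_name"))
joined_df3.display()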
# The same inner join expressed in Spark SQL via temporary views
emp_df.createOrReplaceTempView("employee")
dep_df.createOrReplaceTempView("department")
spark.sql("""
    SELECT employee.emp_id, employee.emp_name, department.department_id, department.department_name
    FROM employee
    INNER JOIN department ON employee.dept_id = department.department_id
""").display()