# Creating Data Frames
You can create a data frame using the data.frame() function. Here's an example:
# Build a small example data frame: one named vector per column,
# each of length three, so every position forms one row.
df <- data.frame(
  Name = c("Alice", "Bob", "Charlie"),
  Age = c(25, 30, 35),
  Salary = c(50000, 60000, 70000)
)
# Print the data frame to the console
print(df)
Accessing Data
By column name: Use the $ operator or double brackets [[]]:
df$Name # Access the Name column
df[["Age"]] # Access the Age column
By index: Use [row, column] indexing:
df[1, ] # Access the first row
df[, 2] # Access the second column
Modifying Data Frames
Add a new column:
df$Department <- c("HR", "Finance", "IT")
Add a new row:
# The appended one-row data frame supplies all four columns of df
# (Name, Age, Salary, Department) so rbind() can match them by name.
df <- rbind(
  df,
  data.frame(
    Name = "David",
    Age = 40,
    Salary = 80000,
    Department = "Marketing"
  )
)
Update existing data:
df[1, "Salary"] <- 55000
Subsetting Data
Subset rows and columns:
df_subset <- df[df$Age > 30, c("Name", "Salary")]
Use subset() function:
df_subset <- subset(df, Age > 30, select = c(Name, Salary))
Summary Statistics
Get a summary of the data frame:
summary(df)
Check the structure:
str(df)
Combining Data Frames
Combine rows:
# Two further rows with the same four columns as df
df2 <- data.frame(
  Name = c("Eva", "Frank"),
  Age = c(45, 50),
  Salary = c(90000, 100000),
  Department = c("Legal", "Admin")
)
# Stack df2 underneath df
df_combined <- rbind(df, df2)
Combine columns:
# cbind() needs one value per row: df_combined has six rows (four in df
# plus two in df2), so Experience must have six entries. With only five,
# cbind() stops with "arguments imply differing number of rows".
df3 <- data.frame(Experience = c(5, 10, 15, 20, 25, 30))
df_combined <- cbind(df_combined, df3)
Useful Functions
head() and tail(): View the first or last few rows of the data frame.
head(df)
tail(df)
dim(): Get the dimensions (number of rows and columns).
dim(df)
nrow() and ncol(): Get the number of rows and columns.
nrow(df)
ncol(df)
colnames() and rownames(): Get or set column and row names.
colnames(df)
rownames(df)
Data Mining Lab Book
# Build the employee data frame: five employees, one vector per column
emp.data <- data.frame(
  employee_id = 101:105,
  employee_name = c("ram", "sham", "neha", "siya", "Sumit"),
  sal = c(40000, 35000, 20000, 25000, 30000),
  starting_date = as.Date(
    c("2020-01-01", "2019-04-12", "2021-05-09", "2019-05-01", "2018-09-03")
  ),
  # Keep employee_name as character rather than factor (matters on R < 4.0)
  stringsAsFactors = FALSE
)
print(emp.data)
employee_id employee_name sal starting_date
1 101 ram 40000 2020-01-01
2 102 sham 35000 2019-04-12
3 103 neha 20000 2021-05-09
4 104 siya 25000 2019-05-01
5 105 Sumit 30000 2018-09-03
final <- data.frame(emp.data$employee_id, emp.data$sal)
print(final)
emp.data.employee_id emp.data.sal
1 101 40000
2 102 35000
3 103 20000
4 104 25000
5 105 30000
final <- data.frame(emp.data$employee_id, emp.data$sal , emp.data$employee_name)
print(final)
emp.data.employee_id emp.data.sal emp.data.employee_name
1 101 40000 ram
2 102 35000 sham
3 103 20000 neha
4 104 25000 siya
5 105 30000 Sumit
emp.data$employee_name # Access the employee_name column
[1] "ram" "sham" "neha" "siya" "Sumit"
>
> emp.data[["sal"]] # Access the sal column
[1] 40000 35000 20000 25000 30000
>
> emp.data[1, ] # Access the first row
employee_id employee_name sal starting_date
1 101 ram 40000 2020-01-01
> emp.data[,2 ] # Access the second column (employee_name)
[1] "ram" "sham" "neha" "siya" "Sumit"
Modifying Data Frames
Add a new column:
emp.data$department <- c("HR", "Finance", "IT", "Marketing", "Admin")
> emp.data
employee_id employee_name sal starting_date department
1 101 ram 40000 2020-01-01 HR
2 102 sham 35000 2019-04-12 Finance
3 103 neha 20000 2021-05-09 IT
4 104 siya 25000 2019-05-01 Marketing
5 105 Sumit 30000 2018-09-03 Admin
Add a new row:
> emp.data <- rbind(emp.data, data.frame(
employee_id = 106,
employee_name = "Geeta",
sal = 32000,
starting_date = as.Date("2022-07-01"),
department = "Sales"
))
> emp.data
employee_id employee_name sal starting_date department
1 101 ram 40000 2020-01-01 HR
2 102 sham 35000 2019-04-12 Finance
3 103 neha 20000 2021-05-09 IT
4 104 siya 25000 2019-05-01 Marketing
5 105 Sumit 30000 2018-09-03 Admin
6 106 Geeta 32000 2022-07-01 Sales
Update existing data:
> emp.data[1, "sal"] <- 42000
> emp.data
employee_id employee_name sal starting_date department
1 101 ram 42000 2020-01-01 HR
2 102 sham 35000 2019-04-12 Finance
3 103 neha 20000 2021-05-09 IT
4 104 siya 25000 2019-05-01 Marketing
5 105 Sumit 30000 2018-09-03 Admin
6 106 Geeta 32000 2022-07-01 Sales
Summary Statistics
Get a summary of the data frame:
summary(emp.data)
employee_id employee_name sal starting_date
Min. :101.0 Length:6 Min. :20000 Min. :2018-09-03
1st Qu.:102.2 Class :character 1st Qu.:26250 1st Qu.:2019-04-16
Median :103.5 Mode :character Median :31000 Median :2019-08-31
Mean :103.5 Mean :30667 Mean :2020-03-09
3rd Qu.:104.8 3rd Qu.:34250 3rd Qu.:2021-01-05
Max. :106.0 Max. :42000 Max. :2022-07-01
department
Length:6
Class :character
Mode :character
Check the structure:
str(emp.data)
'data.frame': 6 obs. of 5 variables:
$ employee_id : num 101 102 103 104 105 106
$ employee_name: chr "ram" "sham" "neha" "siya" ...
$ sal : num 42000 35000 20000 25000 30000 32000
$ starting_date: Date, format: "2020-01-01" "2019-04-12" ...
$ department : chr "HR" "Finance" "IT" "Marketing" ...
Combining Data Frames
Combine rows:
> new_employees <- data.frame(
employee_id = c(107, 108),
employee_name = c("Amit", "Kiran"),
sal = c(33000, 34000),
starting_date = as.Date(c("2023-01-01", "2023-02-15")),
department = c("Support", "R&D")
)
> emp.data_combined <- rbind(emp.data, new_employees)
> emp.data_combined
employee_id employee_name sal starting_date department
1 101 ram 42000 2020-01-01 HR
2 102 sham 35000 2019-04-12 Finance
3 103 neha 20000 2021-05-09 IT
4 104 siya 25000 2019-05-01 Marketing
5 105 Sumit 30000 2018-09-03 Admin
6 106 Geeta 32000 2022-07-01 Sales
7 107 Amit 33000 2023-01-01 Support
8 108 Kiran 34000 2023-02-15 R&D
> emp.data
employee_id employee_name sal starting_date department
1 101 ram 42000 2020-01-01 HR
2 102 sham 35000 2019-04-12 Finance
3 103 neha 20000 2021-05-09 IT
4 104 siya 25000 2019-05-01 Marketing
5 105 Sumit 30000 2018-09-03 Admin
6 106 Geeta 32000 2022-07-01 Sales
# Removing duplicates
> companies <- data.frame(Shares = c("TCS","Reliance", "HDFC Bank", "Infosys", "Reliance"), price= c(3200,1900,1500,3233,2234))
>
> companies
Shares price
1 TCS 3200
2 Reliance 1900
3 HDFC Bank 1500
4 Infosys 3233
5 Reliance 2234
>
>
> cat(" after removing dublicate ","\n")
after removing dublicate
# duplicated(companies) compares whole rows, and the two Reliance rows differ
# in price, so nothing is flagged; without a comma the logical vector also
# selects columns instead of rows. Test the Shares column and index rows.
> companies[!duplicated(companies$Shares), ]
Shares price
1 TCS 3200
2 Reliance 1900
3 HDFC Bank 1500
4 Infosys 3233
>
> companies
Shares price
1 TCS 3200
2 Reliance 1900
3 HDFC Bank 1500
4 Infosys 3233
5 Reliance 2234
>
> unique(companies)
Shares price
1 TCS 3200
2 Reliance 1900
3 HDFC Bank 1500
4 Infosys 3233
5 Reliance 2234
Functions: a function encapsulates a set of instructions that can be reused with different inputs.
#' Print the multiplication table of a number
#'
#' Writes the lines `number * t = product` for t = 1 to 10 to the console.
#'
#' @param number The number whose table is printed.
#' @return `NULL`, invisibly; the function is called for its printed output.
disp_table <- function(number) {
  for (t in 1:10) {
    # writeLines() emits the bare text; print() would wrap every line
    # as `[1] "5 * 1 = 5"` instead of the plain `5 * 1 = 5`.
    writeLines(paste(number, "*", t, "=", number * t))
  }
  invisible(NULL)
}
# Prompt the user to enter a number
> number <- as.integer(readline(prompt = "Please Enter a Number for the Table: "))
Please Enter a Number for the Table: 5
# Display the multiplication table
> disp_table(number)
5 * 1 = 5
5 * 2 = 10
5 * 3 = 15
5 * 4 = 20
5 * 5 = 25
5 * 6 = 30
5 * 7 = 35
5 * 8 = 40
5 * 9 = 45
5 * 10 = 50
No comments:
Post a Comment