Tuesday, August 6, 2024

Data Frames in R language

                  DATA FRAME



# Creating Data Frames


You can create a data frame using the data.frame() function. Here's an example:


# Create a data frame

# Build a small three-row example table: one text column (Name) and two
# numeric columns (Age, Salary). Each argument becomes one column.
df <- data.frame(
  Name   = c("Alice", "Bob", "Charlie"),
  Age    = c(25, 30, 35),
  Salary = c(50000, 60000, 70000)
)


# View the data frame

print(df)


Accessing Data

  • By column name: Use the $ operator or double brackets [[]]:

    df$Name          # Access the Name column

df[["Age"]]      # Access the Age column


  • By index: Use [row, column] indexing:

    df[1, ]          # Access the first row

df[, 2]          # Access the second column


Modifying Data Frames

  • Add a new column:

    df$Department <- c("HR", "Finance", "IT")


  • Add a new row:

    df <- rbind(df, data.frame(Name = "David", Age = 40, Salary = 80000, Department = "Marketing"))


  • Update existing data:

    df[1, "Salary"] <- 55000

Subsetting Data

  • Subset rows and columns:

    df_subset <- df[df$Age > 30, c("Name", "Salary")]


  • Use subset() function:

df_subset <- subset(df, Age > 30, select = c(Name, Salary))


Summary Statistics

  • Get a summary of the data frame:

    summary(df)


  • Check the structure:

    str(df)

Combining Data Frames

Combine rows:

df2 <- data.frame(Name = c("Eva", "Frank"), Age = c(45, 50), Salary = c(90000, 100000), Department = c("Legal", "Admin"))

df_combined <- rbind(df, df2)

Combine columns:

# cbind() on data frames requires every piece to have the same number of rows.
# df_combined holds 6 rows at this point (Alice, Bob, Charlie, David, Eva,
# Frank), so Experience needs 6 values; with only 5, cbind() stops with
# "arguments imply differing number of rows".
df3 <- data.frame(Experience = c(5, 10, 15, 20, 25, 30))

df_combined <- cbind(df_combined, df3)


Useful Functions

  • head() and tail(): View the first or last few rows of the data frame.

    head(df)

      tail(df)


  • dim(): Get the dimensions (number of rows and columns).

    dim(df)


  • nrow() and ncol(): Get the number of rows and columns.

    nrow(df)

    ncol(df)


  • colnames() and rownames(): Get or set column and row names.

    colnames(df)

rownames(df)



DATA mining lab book 


# Create the data frame


 emp.data <- data.frame(

    employee_id = c(101:105),

   employee_name = c("ram", "sham", "neha", "siya", "Sumit"),

    sal = c(40000, 35000, 20000, 25000, 30000),

    starting_date = as.Date(c("2020-01-01", "2019-04-12", "2021-05-09", "2019-05-01", "2018-09-03")),

    stringsAsFactors = FALSE

 )


print(emp.data)


  employee_id employee_name   sal starting_date

1         101           ram 40000    2020-01-01

2         102          sham 35000    2019-04-12

3         103          neha 20000    2021-05-09

4         104          siya 25000    2019-05-01

5         105         Sumit 30000    2018-09-03



final <- data.frame(emp.data$employee_id, emp.data$sal)

 

 print(final)

  emp.data.employee_id emp.data.sal

1                  101        40000

2                  102        35000

3                  103        20000

4                  104        25000

5                  105        30000



final <- data.frame(emp.data$employee_id, emp.data$sal , emp.data$employee_name)


print(final)

  emp.data.employee_id emp.data.sal emp.data.employee_name

1                  101        40000                    ram

2                  102        35000                   sham

3                  103        20000                   neha

4                  104        25000                   siya

5                  105        30000                  Sumit



emp.data$employee_name     # Access the employee_name column

[1] "ram"   "sham"  "neha"  "siya"  "Sumit"

> emp.data[["sal"]]   # Access the sal column

[1] 40000 35000 20000 25000 30000

> emp.data[1, ] # Access the first row

  employee_id employee_name   sal starting_date

1         101           ram 40000    2020-01-01

> emp.data[,2 ]  # Access the second column (employee_name)

[1] "ram"   "sham"  "neha"  "siya"  "Sumit"


Modifying Data Frames

  • Add a new column:


 emp.data$department <- c("HR", "Finance", "IT", "Marketing", "Admin")


> emp.data


  employee_id employee_name   sal starting_date department

1         101           ram 40000    2020-01-01         HR

2         102          sham 35000    2019-04-12    Finance

3         103          neha 20000    2021-05-09         IT

4         104          siya 25000    2019-05-01  Marketing

5         105         Sumit 30000    2018-09-03      Admin



Add a new row:


> emp.data <- rbind(emp.data, data.frame(

    employee_id = 106,

   employee_name = "Geeta",

   sal = 32000,

    starting_date = as.Date("2022-07-01"),

    department = "Sales"

 ))

 

> emp.data


  employee_id employee_name   sal starting_date department

1         101           ram 40000    2020-01-01         HR

2         102          sham 35000    2019-04-12    Finance

3         103          neha 20000    2021-05-09         IT

4         104          siya 25000    2019-05-01  Marketing

5         105         Sumit 30000    2018-09-03      Admin

6         106         Geeta 32000    2022-07-01      Sales



Update existing data:

> emp.data[1, "sal"] <- 42000


> emp.data

  employee_id employee_name   sal starting_date department

1         101           ram 42000    2020-01-01         HR

2         102          sham 35000    2019-04-12    Finance

3         103          neha 20000    2021-05-09         IT

4         104          siya 25000    2019-05-01  Marketing

5         105         Sumit 30000    2018-09-03      Admin

6         106         Geeta 32000    2022-07-01      Sales




Summary Statistics

  • Get a summary of the data frame:

summary(emp.data)


  employee_id    employee_name           sal        starting_date       

 Min.   :101.0   Length:6           Min.   :20000   Min.   :2018-09-03  

 1st Qu.:102.2   Class :character   1st Qu.:26250   1st Qu.:2019-04-16  

 Median :103.5   Mode  :character   Median :31000   Median :2019-08-31  

 Mean   :103.5                      Mean   :30667   Mean   :2020-03-09  

 3rd Qu.:104.8                      3rd Qu.:34250   3rd Qu.:2021-01-05  

 Max.   :106.0                      Max.   :42000   Max.   :2022-07-01  

  department       

 Length:6          

 Class :character  

 Mode  :character


Check the structure:


 str(emp.data)


'data.frame': 6 obs. of  5 variables:

 $ employee_id  : num  101 102 103 104 105 106

 $ employee_name: chr  "ram" "sham" "neha" "siya" ...

 $ sal          : num  42000 35000 20000 25000 30000 32000

 $ starting_date: Date, format: "2020-01-01" "2019-04-12" ...

 $ department   : chr  "HR" "Finance" "IT" "Marketing" ...


Combining Data Frames

  • Combine rows:


> new_employees <- data.frame(

    employee_id = c(107, 108),

    employee_name = c("Amit", "Kiran"),

   sal = c(33000, 34000),

    starting_date = as.Date(c("2023-01-01", "2023-02-15")),

    department = c("Support", "R&D")

 )

 

 

> emp.data_combined <- rbind(emp.data, new_employees)


> emp.data_combined


  employee_id employee_name   sal starting_date department

1         101           ram 42000    2020-01-01         HR

2         102          sham 35000    2019-04-12    Finance

3         103          neha 20000    2021-05-09         IT

4         104          siya 25000    2019-05-01  Marketing

5         105         Sumit 30000    2018-09-03      Admin

6         106         Geeta 32000    2022-07-01      Sales

7         107          Amit 33000    2023-01-01    Support

8         108         Kiran 34000    2023-02-15        R&D


> emp.data

  employee_id employee_name   sal starting_date department

1         101           ram 42000    2020-01-01         HR

2         102          sham 35000    2019-04-12    Finance

3         103          neha 20000    2021-05-09         IT

4         104          siya 25000    2019-05-01  Marketing

5         105         Sumit 30000    2018-09-03      Admin

6         106         Geeta 32000    2022-07-01      Sales




# Removing duplicates


> companies <- data.frame(Shares = c("TCS","Reliance", "HDFC Bank", "Infosys", "Reliance"),      price= c(3200,1900,1500,3233,2234))

> companies

     Shares price

1       TCS  3200

2  Reliance  1900

3 HDFC Bank  1500

4   Infosys  3233

5  Reliance  2234

> cat(" after removing duplicates ","\n")

 after removing duplicates  


> companies[!duplicated(companies$Shares), ]   # keep only the first row for each share name

     Shares price

1       TCS  3200

2  Reliance  1900

3 HDFC Bank  1500

4   Infosys  3233

> companies

     Shares price

1       TCS  3200

2  Reliance  1900

3 HDFC Bank  1500

4   Infosys  3233

5  Reliance  2234

> unique(companies)

     Shares price

1       TCS  3200

2  Reliance  1900

3 HDFC Bank  1500

4   Infosys  3233

5  Reliance  2234


A function encapsulates a set of instructions that can be reused with different inputs.


# Print the multiplication table of `number` for the multipliers 1 to 10.
#
# Args:
#   number: the number whose table is printed.
#
# Output:
#   Ten lines of the form "5 * 1 = 5" written to the console. writeLines() is
#   used instead of print(), which would wrap every line as [1] "5 * 1 = 5".
#
# Returns:
#   NULL, invisibly; the function is called for its printed output.
disp_table <- function(number) {
  multipliers <- 1:10
  # paste() is vectorised, so all ten rows are built in one call.
  rows <- paste(number, "*", multipliers, "=", number * multipliers)
  writeLines(rows)
  invisible(NULL)
}


  # Prompt the user to enter a number


> number <- as.integer(readline(prompt = "Please Enter a Number for the Table: "))

Please Enter a Number for the Table: 5


# Display the multiplication table


> disp_table(number)

5 * 1 = 5 

5 * 2 = 10 

5 * 3 = 15 

5 * 4 = 20 

5 * 5 = 25 

5 * 6 = 30 

5 * 7 = 35 

5 * 8 = 40 

5 * 9 = 45 

5 * 10 = 50


No comments:

Post a Comment