Data Cleaning in Python Pandas¶
In [1]:
import pandas as pd
In [106]:
df = pd.read_csv(r"C:\Users\binarystudy\Projects\students_data.csv")
In [107]:
df
Out[107]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | jane smith | 16.0 | female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10th | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | NaN | MALE | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | jane smith | 16.0 | FEMALE | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | ali Khan | 17.0 | female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | lucy gray | 17.0 | male | 11th | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | FEMALE | 11th | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | NaN | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | male | 11th | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | MALE | 11th | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | MALE | 11th | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | jane smith | 17.0 | Female | 11th | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | MALE | 11th | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | NaN | Male | 10th | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | FEMALE | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
| 30 | 129 | Patel R. | 17.0 | male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [108]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 31 entries, 0 to 30 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 student_id 31 non-null int64 1 name 31 non-null object 2 age 28 non-null float64 3 gender 31 non-null object 4 grade 31 non-null object 5 math_score 13 non-null float64 6 english_score 23 non-null object 7 science_score 31 non-null int64 8 enrolled_date 31 non-null object 9 remarks 31 non-null object dtypes: float64(2), int64(2), object(6) memory usage: 2.6+ KB
In [109]:
df.duplicated()
Out[109]:
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False 15 False 16 False 17 False 18 False 19 False 20 False 21 False 22 False 23 False 24 False 25 False 26 False 27 False 28 False 29 False 30 True dtype: bool
In [110]:
df = df.drop_duplicates()
In [111]:
df
Out[111]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | jane smith | 16.0 | female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10th | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | NaN | MALE | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | jane smith | 16.0 | FEMALE | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | ali Khan | 17.0 | female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | lucy gray | 17.0 | male | 11th | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | FEMALE | 11th | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | NaN | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | male | 11th | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | MALE | 11th | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | MALE | 11th | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | jane smith | 17.0 | Female | 11th | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | MALE | 11th | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | NaN | Male | 10th | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | FEMALE | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [112]:
df.loc[:, "name"] = df["name"].str.lower().str.title()
In [113]:
df
Out[113]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10th | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | NaN | MALE | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | FEMALE | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | male | 11th | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | FEMALE | 11th | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | NaN | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | male | 11th | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | MALE | 11th | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | MALE | 11th | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11th | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | MALE | 11th | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | NaN | Male | 10th | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | FEMALE | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [114]:
mean_age = df["age"].mean().round(0)
In [115]:
df.loc[:, "age"] = df["age"].fillna(mean_age)
In [116]:
df
Out[116]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10th | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | MALE | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | FEMALE | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | male | 11th | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | FEMALE | 11th | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | male | 11th | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | MALE | 11th | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | MALE | 11th | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11th | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | MALE | 11th | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10th | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | FEMALE | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [117]:
df.loc[:, "gender"] = df["gender"].str.lower().str.title()
In [118]:
df
Out[118]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10th | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11th | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | Female | 11th | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | Male | 11th | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | Male | 11th | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11th | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11th | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | Male | 11th | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10th | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [119]:
df["grade"].unique()
Out[119]:
array(['11', '10th', '10', '12', '11th'], dtype=object)
In [120]:
df.loc[:, "grade"] = df["grade"].replace({'10th' : '10', '11th' : '11'})
In [121]:
df
Out[121]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | NaN | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | NaN | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | NaN | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | NaN | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | NaN | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | NaN | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | NaN | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | NaN | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | NaN | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | NaN | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | NaN | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | NaN | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | NaN | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | Male | 11 | NaN | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | NaN | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | NaN | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | NaN | 64 | 83 | 2022/06/11 | Average |
In [122]:
mean_math = df["math_score"].mean()
mean_math
Out[122]:
np.float64(74.0)
In [123]:
df.loc[:, "math_score"] = df["math_score"].fillna(mean_math)
In [124]:
df
Out[124]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | 74.0 | missing | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | 74.0 | missing | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | 74.0 | 96 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | 74.0 | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | 74.0 | 63 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | missing | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | 74.0 | missing | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | 74.0 | missing | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | 74.0 | 87 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | 74.0 | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | 74.0 | 91 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | 74.0 | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | 74.0 | missing | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | 74.0 | missing | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | Male | 11 | 74.0 | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | missing | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | 74.0 | 76 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | 74.0 | 64 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | 74.0 | 64 | 83 | 2022/06/11 | Average |
In [125]:
df["english_score"]
Out[125]:
0 NaN 1 95 2 missing 3 missing 4 96 5 NaN 6 NaN 7 63 8 missing 9 missing 10 67 11 missing 12 87 13 NaN 14 91 15 91 16 NaN 17 74 18 74 19 NaN 20 missing 21 72 22 NaN 23 missing 24 NaN 25 80 26 missing 27 76 28 64 29 64 Name: english_score, dtype: object
In [126]:
df.loc[:, "english_score"] = pd.to_numeric(df["english_score"], errors = 'coerce')
In [127]:
df
Out[127]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | NaN | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95.0 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | 74.0 | NaN | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | 74.0 | NaN | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | 74.0 | 96.0 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | 74.0 | NaN | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | NaN | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | 74.0 | 63.0 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | NaN | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | 74.0 | NaN | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67.0 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | 74.0 | NaN | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | 74.0 | 87.0 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | 74.0 | NaN | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | 74.0 | 91.0 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91.0 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | 74.0 | NaN | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74.0 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74.0 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | NaN | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | 74.0 | NaN | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72.0 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | NaN | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | 74.0 | NaN | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | Male | 11 | 74.0 | NaN | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80.0 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | NaN | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | 74.0 | 76.0 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | 74.0 | 64.0 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | 74.0 | 64.0 | 83 | 2022/06/11 | Average |
In [129]:
df.loc[:, "english_score"] = df["english_score"].fillna(df["english_score"].mean().round())
C:\Users\91851\AppData\Local\Temp\ipykernel_31976\2816007329.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df.loc[:, "english_score"] = df["english_score"].fillna(df["english_score"].mean().round())
In [130]:
df
Out[130]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | excellent |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95.0 | 94 | 10-06-2022 | GOOD |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | 74.0 | 78.0 | 69 | 06/12/2022 | needs improvement |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | 74.0 | 78.0 | 62 | 10-06-2022 | average |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | 74.0 | 96.0 | 64 | 2022-06-10 | GOOD |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | 74.0 | 78.0 | 83 | 06/12/2022 | needs improvement |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | 78.0 | 75 | 06/12/2022 | Good |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | 74.0 | 63.0 | 62 | 2022/06/11 | excellent |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | 78.0 | 89 | 06/12/2022 | poor |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | 74.0 | 78.0 | 97 | 10-06-2022 | needs improvement |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67.0 | 100 | 06/12/2022 | excellent |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | 74.0 | 78.0 | 95 | 2022-06-10 | average |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | 74.0 | 87.0 | 89 | 2022-06-10 | poor |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | 74.0 | 78.0 | 98 | 06/12/2022 | Average |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | 74.0 | 91.0 | 67 | 2022/06/11 | poor |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91.0 | 94 | 06/12/2022 | Average |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | 74.0 | 78.0 | 72 | 2022/06/11 | excellent |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74.0 | 81 | 2022-06-10 | GOOD |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74.0 | 62 | 2022/06/11 | average |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | 78.0 | 90 | 06/12/2022 | Good |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | 74.0 | 78.0 | 89 | 2022-06-10 | average |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72.0 | 94 | 10-06-2022 | Average |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | good student |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | 74.0 | 78.0 | 63 | 06/12/2022 | excellent |
| 24 | 124 | John Doe | 18.0 | Male | 11 | 74.0 | 78.0 | 91 | 06/12/2022 | GOOD |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80.0 | 63 | 10-06-2022 | Average |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | 78.0 | 67 | 06/12/2022 | good student |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | 74.0 | 76.0 | 80 | 06/12/2022 | poor |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | 74.0 | 64.0 | 89 | 2022-06-10 | average |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | 74.0 | 64.0 | 83 | 2022/06/11 | Average |
In [131]:
df["enrolled_date"].unique()
Out[131]:
array(['2022-06-10', '10-06-2022', '06/12/2022', '2022/06/11'],
dtype=object)
In [132]:
yyyy-mm-dd
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[132], line 1 ----> 1 yyyy-mm-dd NameError: name 'yyyy' is not defined
In [133]:
dict_replace = {
'10-06-2022': '2022-06-10',
'06/12/2022': '2022-06-12',
'2022/06/11': '2022-06-11'
}
In [134]:
df.loc[:, "enrolled_date"] = df["enrolled_date"].replace(dict_replace)
In [147]:
df
Out[147]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | enrolled_date1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | excellent | 2022-06-10 |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95.0 | 94 | 2022-06-10 | GOOD | 2022-06-10 |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | 74.0 | 78.0 | 69 | 2022-06-12 | needs improvement | 2022-06-12 |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | 74.0 | 78.0 | 62 | 2022-06-10 | average | 2022-06-10 |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | 74.0 | 96.0 | 64 | 2022-06-10 | GOOD | 2022-06-10 |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | 74.0 | 78.0 | 83 | 2022-06-12 | needs improvement | 2022-06-12 |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | 78.0 | 75 | 2022-06-12 | Good | 2022-06-12 |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | 74.0 | 63.0 | 62 | 2022-06-11 | excellent | 2022-06-11 |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | 78.0 | 89 | 2022-06-12 | poor | 2022-06-12 |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | 74.0 | 78.0 | 97 | 2022-06-10 | needs improvement | 2022-06-10 |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67.0 | 100 | 2022-06-12 | excellent | 2022-06-12 |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | 74.0 | 78.0 | 95 | 2022-06-10 | average | 2022-06-10 |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | 74.0 | 87.0 | 89 | 2022-06-10 | poor | 2022-06-10 |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | 74.0 | 78.0 | 98 | 2022-06-12 | Average | 2022-06-12 |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | 74.0 | 91.0 | 67 | 2022-06-11 | poor | 2022-06-11 |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91.0 | 94 | 2022-06-12 | Average | 2022-06-12 |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | 74.0 | 78.0 | 72 | 2022-06-11 | excellent | 2022-06-11 |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74.0 | 81 | 2022-06-10 | GOOD | 2022-06-10 |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74.0 | 62 | 2022-06-11 | average | 2022-06-11 |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | 78.0 | 90 | 2022-06-12 | Good | 2022-06-12 |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | 74.0 | 78.0 | 89 | 2022-06-10 | average | 2022-06-10 |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72.0 | 94 | 2022-06-10 | Average | 2022-06-10 |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | good student | 2022-06-10 |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | 74.0 | 78.0 | 63 | 2022-06-12 | excellent | 2022-06-12 |
| 24 | 124 | John Doe | 18.0 | Male | 11 | 74.0 | 78.0 | 91 | 2022-06-12 | GOOD | 2022-06-12 |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80.0 | 63 | 2022-06-10 | Average | 2022-06-10 |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | 78.0 | 67 | 2022-06-12 | good student | 2022-06-12 |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | 74.0 | 76.0 | 80 | 2022-06-12 | poor | 2022-06-12 |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | 74.0 | 64.0 | 89 | 2022-06-10 | average | 2022-06-10 |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | 74.0 | 64.0 | 83 | 2022-06-11 | Average | 2022-06-11 |
In [145]:
converted_date = pd.to_datetime(df["enrolled_date"])
In [146]:
df["enrolled_date"]
Out[146]:
0 2022-06-10 1 2022-06-10 2 2022-06-12 3 2022-06-10 4 2022-06-10 5 2022-06-12 6 2022-06-12 7 2022-06-11 8 2022-06-12 9 2022-06-10 10 2022-06-12 11 2022-06-10 12 2022-06-10 13 2022-06-12 14 2022-06-11 15 2022-06-12 16 2022-06-11 17 2022-06-10 18 2022-06-11 19 2022-06-12 20 2022-06-10 21 2022-06-10 22 2022-06-10 23 2022-06-12 24 2022-06-12 25 2022-06-10 26 2022-06-12 27 2022-06-12 28 2022-06-10 29 2022-06-11 Name: enrolled_date, dtype: datetime64[ns]
In [150]:
df["remarks"].unique()
Out[150]:
array(['excellent', 'GOOD', 'needs improvement', 'average', 'Good',
'poor', 'Average', 'good student'], dtype=object)
In [155]:
dict_remark = {
'excellent' : 'Excellent',
'GOOD' : 'Good',
'needs improvement': 'Poor',
'poor' : 'Poor',
'average' : 'Average',
'good student': 'Good'
}
In [158]:
df.loc[:, 'remarks'] = df["remarks"].replace(dict_remark)
In [159]:
df
Out[159]:
| student_id | name | age | gender | grade | math_score | english_score | science_score | enrolled_date | remarks | enrolled_date1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100 | Jane Smith | 16.0 | Female | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | Excellent | 2022-06-10 |
| 1 | 101 | John Doe | 16.0 | Male | 10 | 74.0 | 95.0 | 94 | 2022-06-10 | Good | 2022-06-10 |
| 2 | 102 | Chris P. | 17.0 | Male | 10 | 74.0 | 78.0 | 69 | 2022-06-12 | Poor | 2022-06-12 |
| 3 | 103 | Jane Smith | 16.0 | Female | 10 | 74.0 | 78.0 | 62 | 2022-06-10 | Average | 2022-06-10 |
| 4 | 104 | Sara O'Neil | 16.0 | Male | 11 | 74.0 | 96.0 | 64 | 2022-06-10 | Good | 2022-06-10 |
| 5 | 105 | Mike O’Reilly | 16.0 | Female | 10 | 74.0 | 78.0 | 83 | 2022-06-12 | Poor | 2022-06-12 |
| 6 | 106 | Ali Khan | 17.0 | Female | 11 | 64.0 | 78.0 | 75 | 2022-06-12 | Good | 2022-06-12 |
| 7 | 107 | Sara O'Neil | 17.0 | Female | 12 | 74.0 | 63.0 | 62 | 2022-06-11 | Excellent | 2022-06-11 |
| 8 | 108 | Mike O’Reilly | 16.0 | Female | 12 | 80.0 | 78.0 | 89 | 2022-06-12 | Poor | 2022-06-12 |
| 9 | 109 | Robert Brown | 17.0 | Female | 12 | 74.0 | 78.0 | 97 | 2022-06-10 | Poor | 2022-06-10 |
| 10 | 110 | Lucy Gray | 17.0 | Male | 11 | 65.0 | 67.0 | 100 | 2022-06-12 | Excellent | 2022-06-12 |
| 11 | 111 | Simran Singh | 16.0 | Female | 11 | 74.0 | 78.0 | 95 | 2022-06-10 | Average | 2022-06-10 |
| 12 | 112 | Patel R. | 17.0 | Female | 11 | 74.0 | 87.0 | 89 | 2022-06-10 | Poor | 2022-06-10 |
| 13 | 113 | Patel R. | 17.0 | Male | 10 | 74.0 | 78.0 | 98 | 2022-06-12 | Average | 2022-06-12 |
| 14 | 114 | Ali Khan | 17.0 | Male | 12 | 74.0 | 91.0 | 67 | 2022-06-11 | Poor | 2022-06-11 |
| 15 | 115 | Lucy Gray | 16.0 | Female | 12 | 65.0 | 91.0 | 94 | 2022-06-12 | Average | 2022-06-12 |
| 16 | 116 | Chris P. | 17.0 | Female | 11 | 74.0 | 78.0 | 72 | 2022-06-11 | Excellent | 2022-06-11 |
| 17 | 117 | Ali Khan | 18.0 | Male | 11 | 67.0 | 74.0 | 81 | 2022-06-10 | Good | 2022-06-10 |
| 18 | 118 | Simran Singh | 17.0 | Male | 10 | 100.0 | 74.0 | 62 | 2022-06-11 | Average | 2022-06-11 |
| 19 | 119 | Patel R. | 17.0 | Male | 11 | 73.0 | 78.0 | 90 | 2022-06-12 | Good | 2022-06-12 |
| 20 | 120 | Sara O'Neil | 17.0 | Male | 10 | 74.0 | 78.0 | 89 | 2022-06-10 | Average | 2022-06-10 |
| 21 | 121 | John Doe | 18.0 | Female | 11 | 66.0 | 72.0 | 94 | 2022-06-10 | Average | 2022-06-10 |
| 22 | 122 | Sara O'Neil | 17.0 | Male | 11 | 75.0 | 78.0 | 66 | 2022-06-10 | Good | 2022-06-10 |
| 23 | 123 | Jane Smith | 17.0 | Female | 11 | 74.0 | 78.0 | 63 | 2022-06-12 | Excellent | 2022-06-12 |
| 24 | 124 | John Doe | 18.0 | Male | 11 | 74.0 | 78.0 | 91 | 2022-06-12 | Good | 2022-06-12 |
| 25 | 125 | Mike O’Reilly | 17.0 | Male | 12 | 94.0 | 80.0 | 63 | 2022-06-10 | Average | 2022-06-10 |
| 26 | 126 | John Doe | 17.0 | Male | 10 | 64.0 | 78.0 | 67 | 2022-06-12 | Good | 2022-06-12 |
| 27 | 127 | Simran Singh | 16.0 | Female | 12 | 74.0 | 76.0 | 80 | 2022-06-12 | Poor | 2022-06-12 |
| 28 | 128 | Sara O'Neil | 17.0 | Female | 11 | 74.0 | 64.0 | 89 | 2022-06-10 | Average | 2022-06-10 |
| 29 | 129 | Patel R. | 17.0 | Male | 11 | 74.0 | 64.0 | 83 | 2022-06-11 | Average | 2022-06-11 |
In [157]:
df["remarks"].replace(dict_remark).unique()
Out[157]:
array(['Excellent', 'Good', 'Poor', 'Average'], dtype=object)
In [162]:
df.iloc[:,-2]
Out[162]:
0 Excellent 1 Good 2 Poor 3 Average 4 Good 5 Poor 6 Good 7 Excellent 8 Poor 9 Poor 10 Excellent 11 Average 12 Poor 13 Average 14 Poor 15 Average 16 Excellent 17 Good 18 Average 19 Good 20 Average 21 Average 22 Good 23 Excellent 24 Good 25 Average 26 Good 27 Poor 28 Average 29 Average Name: remarks, dtype: object
In [163]:
pd.__version__
Out[163]:
'2.2.3'
In [166]:
df.at[1, "remarks"]
Out[166]:
'Good'
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
import pandas as pd
# Load dataset
df = pd.read_csv("dirty_students_data_30rows.csv")
# 1. View basic info
print("Initial Data Overview:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nSample Rows:")
print(df.head())
# 2. Drop exact duplicates
df = df.drop_duplicates()
# 3. Normalize 'gender' values
df['gender'] = df['gender'].str.strip().str.lower().map({
'male': 'Male',
'female': 'Female'
})
# 4. Normalize 'grade'
df['grade'] = df['grade'].replace({'10th': '10', '11th': '11'}).astype(str)
# 5. Convert 'english_score' to numeric, set errors='coerce' to convert 'missing' to NaN
df['english_score'] = pd.to_numeric(df['english_score'], errors='coerce')
# 6. Handle missing values (example: fill numeric NaNs with mean)
numeric_cols = ['age', 'math_score', 'english_score']
for col in numeric_cols:
df[col] = df[col].fillna(df[col].mean().round(1))
# 7. Convert 'enrolled_date' to datetime format
df['enrolled_date'] = pd.to_datetime(df['enrolled_date'], errors='coerce')
# 8. Capitalize student names
df['name'] = df['name'].str.title()
# 9. Normalize 'remarks'
df['remarks'] = df['remarks'].str.strip().str.lower().replace({
'good student': 'Good',
'good': 'Good',
'excellent': 'Excellent',
'average': 'Average',
'poor': 'Poor',
'needs improvement': 'Needs Improvement'
})
# 10. Final review
print("\nCleaned Data Sample:")
print(df.head())
print("\nData Types:")
print(df.dtypes)
In [ ]:
Comments
Post a Comment