YouTip LogoYouTip

Ml Python Libs

This chapter will introduce you in detail to the four core Python libraries in machine learning: NumPy, Pandas, Matplotlib, and Scikit-learn. **Machine learning libraries are like a professional toolbox**: each library has a specific purpose, and using them together can complete complex machine learning tasks. !(#) ### Roles of the Four Core Libraries * **NumPy**: The foundation of numerical computing, providing efficient array operations * **Pandas**: A powerful tool for data processing, providing data structures and analysis tools * **Matplotlib**: The brush for data visualization, creating various charts * **Scikit-learn**: The Swiss Army knife of machine learning, providing a complete ML toolchain * * * ## NumPy: The Foundation of Numerical Computing ### What is NumPy? **NumPy is like a calculator for mathematical computations**, but infinitely more powerful. It is the foundational library for scientific computing in Python, providing efficient multi-dimensional array objects. ### Core Concepts of NumPy #### 1. Array ## Example # NumPy Array Basic Operations import numpy as np # Different ways to create arrays print("=== NumPy Array Creation ===") # Create from list arr1 = np.array([1,2,3,4,5]) print(f"Created from list: {arr1}") # Create arithmetic array arr2 = np.arange(0,10,2)# 0 to 10, step 2 print(f"Arithmetic array: {arr2}") # Create evenly spaced array arr3 = np.linspace(0,1,5)# 0 to 1, 5 points print(f"Evenly spaced array: {arr3}") # Create special arrays zeros_arr = np.zeros((2,3))# 2x3 zero array ones_arr = np.ones((2,3))# 2x3 ones array identity_arr = np.eye(3)# 3x3 identity matrix print(f"Zero array:\n{zeros_arr}") print(f"Ones array:\n{ones_arr}") print(f"Identity matrix:\n{identity_arr}") #### 2. 
Array Operations ## Example # Basic Array Operations print("\n=== Array Basic Operations ===") # Array attributes arr = np.array([[1,2,3],[4,5,6]]) print(f"Array:\n{arr}") print(f"Shape: {arr.shape}") print(f"Dimensions: {arr.ndim}") print(f"Number of elements: {arr.size}") print(f"Data type: {arr.dtype}") # Array indexing and slicing print(f"First row: {arr[0]}") print(f"First column: {arr[:, 0]}") print(f"Element [1,2]: {arr[1, 2]}") # Array operations arr1 = np.array([1,2,3]) arr2 = np.array([4,5,6]) print(f"Addition: {arr1 + arr2}") print(f"Multiplication: {arr1 * arr2}") print(f"Dot product: {np.dot(arr1, arr2)}") # Statistical functions data = np.array([1,2,3,4,5,6,7,8,9,10]) print(f"Mean: {np.mean(data)}") print(f"Standard deviation: {np.std(data)}") print(f"Maximum: {np.max(data)}") print(f"Minimum: {np.min(data)}") print(f"Median: {np.median(data)}") #### NumPy Practical Application Example ## Example # NumPy Practical Application: Simple Linear Regression def numpy_linear_regression(): """Implement simple linear regression using NumPy""" # Generate sample data np.random.seed(42) X =2 * np.random.rand(100,1)# Features y =4 + 3 * X + np.random.randn(100,1)# Labels + noise # Add x0 = 1 to X X_b = np.c_[np.ones((100,1)), X]# Add bias term # Solve using normal equation: θ = (X^T * X)^(-1) * X^T * y theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y) print("=== NumPy Linear Regression Example ===") print(f"Learned parameters: intercept={theta_best[0][0]:.2f}, slope={theta_best[1][0]:.2f}") # Prediction X_new = np.array([[0],[2]]) X_new_b = np.c_[np.ones((2,1)), X_new] y_predict = X_new_b.dot(theta_best) print(f"Predictions: when X=0, y={y_predict[0][0]:.2f}; when X=2, y={y_predict[1][0]:.2f}") return theta_best, X, y # Run example theta, X, y = numpy_linear_regression() * * * ## Pandas: A Powerful Tool for Data Processing ### What is Pandas? 
**Pandas is like a Swiss Army knife for data processing**, providing powerful data structures and data analysis tools, especially suitable for handling tabular data. ### Core Data Structures of Pandas #### 1. Series (One-dimensional Data) ## Example # Pandas Series Basic Operations import pandas as pd print("=== Pandas Series ===") # Create Series from list s1 = pd.Series([1,2,3,4,5]) print(f"Created from list:\n{s1}") # Series with index s2 = pd.Series([10,20,30], index=['a','b','c']) print(f"\nSeries with index:\n{s2}") # Create Series from dictionary s3 = pd.Series({'Math': 90,'English': 85,'Physics': 88}) print(f"\nCreated from dictionary:\n{s3}") # Series operations print(f"\nAccess element: s2['b'] = {s2['b']}") print(f"Slicing: s2[0:2] =\n{s2[0:2]}") print(f"Statistics:\n{s2.describe()}") #### 2. DataFrame (Two-dimensional Data) ## Example # Pandas DataFrame Basic Operations print("\n=== Pandas DataFrame ===") # Create DataFrame data ={ 'Name': ['Zhang San','Li Si','Wang Wu','Zhao Liu'], 'Age': [25,30,35,28], 'City': ['Beijing','Shanghai','Guangzhou','Shenzhen'], 'Salary': [15000,20000,18000,22000] } df = pd.DataFrame(data) print("Original DataFrame:") print(df) # DataFrame basic operations print(f"\nDataFrame shape: {df.shape}") print(f"\nColumn names: {list(df.columns)}") print(f"\nData types:\n{df.dtypes}") # Select data print(f"\nSelect 'Name' column:\n{df['Name']}") print(f"\nSelect first two rows:\n{df.head(2)}") print(f"\nSelect rows where age > 28:\n{df[df['Age'] > 28]}") # Statistics print(f"\nStatistics for numeric columns:\n{df.describe()}") # Add new column df['Annual Salary']= df['Salary'] * 12 print(f"\nAfter adding annual salary column:\n{df}") #### Pandas Data Processing Example ## Example # Pandas Data Processing Complete Example def pandas_data_processing(): """Demonstrate complete Pandas data processing workflow""" print("=== Pandas Data Processing Example ===") # 1. 
Create sample data np.random.seed(42) n_samples =1000 data ={ 'StudentID': range(1, n_samples + 1), 'Name': [f'Student{i}'for i in range(1, n_samples + 1)], 'Age': np.random.randint(18,25, n_samples), 'Gender': np.random.choice(['Male','Female'], n_samples), 'MathScore': np.random.normal(75,15, n_samples), 'EnglishScore': np.random.normal(80,12, n_samples), 'PhysicsScore': np.random.normal(72,18, n_samples), 'Class': np.random.choice(['Class 1','Class 2','Class 3'], n_samples) } df = pd.DataFrame(data) # 2. Data cleaning print("Original data shape:", df.shape) # Handle outliers (scores should be between 0-100) score_columns =['MathScore','EnglishScore','PhysicsScore'] for col in score_columns: df[col]= df[col].clip(0,100) # 3. Feature engineering # Calculate total and average scores df['TotalScore']= df[score_columns].sum(axis=1) df['AverageScore']= df[score_columns].mean(axis=1) # Add grade def get_grade(score): if score >=90: return'A' elif score >=80: return'B' elif score >=70: return'C' elif score >=60: return'D' else: return'F' df['Grade']= df['AverageScore'].apply(get_grade) # 4. Data analysis print("\n=== Data Analysis Results ===") # Basic statistics print("Average scores by subject:") print(df[score_columns].mean()) # Analysis by class print("\nAverage scores by class:") class_avg = df.groupby('Class')['AverageScore'].mean() print(class_avg) # Analysis by gender print("\nGender distribution:") gender_count = df['Gender'].value_counts() print(gender_count) # Grade distribution print("\nGrade distribution:") grade_dist = df['Grade'].value_counts().sort_index() print(grade_dist) # 5. 
Data filtering print("\n=== Specific Data Filtering ===") # Excellent students (average score > 85) excellent_students = df[df['AverageScore']>85].head(5) print("Excellent students (top 5):") print(excellent_students[['Name','AverageScore','Grade']]) # Highest scoring student in each class print("\nHighest scoring student in each class:") top_students = df.loc[df.groupby('Class')['AverageScore'].idxmax()] print(top_students[['Class','Name','AverageScore']]) return df # Run example student_df = pandas_data_processing() * * * ## Matplotlib: The Brush for Data Visualization ### What is Matplotlib? **Matplotlib is like a data artist's brush**, capable of transforming dull data into intuitive charts, helping us understand patterns and relationships in data. ### Matplotlib Basic Charts ## Example # Matplotlib Basic Chart Examples import matplotlib.pyplot as plt import numpy as np # Set Chinese font (prevent Chinese from displaying as squares) plt.rcParams['font.sans-serif']=['SimHei','Arial Unicode MS'] plt.rcParams['axes.unicode_minus']=False def matplotlib_basic_charts(): """Demonstrate Matplotlib basic charts""" print("=== Matplotlib Basic Chart Examples ===") # 1. Line chart plt.figure(figsize=(12,8)) plt.subplot(2,3,1) x = np.linspace(0,10,100) y1 = np.sin(x) y2 = np.cos(x) plt.plot(x, y1, label='sin(x)') plt.plot(x, y2, label='cos(x)') plt.title('Trigonometric Functions') plt.xlabel('x') plt.ylabel('y') plt.legend() plt.grid(True) # 2. Scatter plot plt.subplot(2,3,2) np.random.seed(42) x = np.random.randn(100) y =2 * x + np.random.randn(100) * 0.5 plt.scatter(x, y, alpha=0.6, c='blue') plt.title('Scatter Plot') plt.xlabel('X') plt.ylabel('Y') # 3. Bar chart plt.subplot(2,3,3) categories =['A','B','C','D','E'] values =[23,45,56,78,32] plt.bar(categories, values, color=['red','green','blue','orange','purple']) plt.title('Bar Chart') plt.xlabel('Category') plt.ylabel('Value') # 4. 
Histogram plt.subplot(2,3,4) data = np.random.normal(100,15,1000) plt.hist(data, bins=30, alpha=0.7, color='skyblue', edgecolor='black') plt.title('Histogram') plt.xlabel('Value') plt.ylabel('Frequency') # 5. Pie chart plt.subplot(2,3,5) sizes =[30,25,20,15,10] labels =['A','B','C','D','E'] colors =['gold','lightcoral','lightskyblue','lightgreen','plum'] plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90) plt.title('Pie Chart') # 6. Box plot plt.subplot(2,3,6) data1 = np.random.normal(0,1,100) data2 = np.random.normal(2,1,100) data3 = np.random.normal(-2,1,100) plt.boxplot([data1, data2, data3], labels=['Group 1','Group 2','Group 3']) plt.title('Box Plot') plt.ylabel('Value') plt.tight_layout() plt.show() print("Chart displayed!") # Run example matplotlib_basic_charts() #### Advanced Visualization Example ## Example # Advanced Visualization Example def advanced_visualization(): """Demonstrate advanced visualization techniques""" print("=== Advanced Visualization Example ===") # Create more complex data np.random.seed(42) n_points =200 # Generate correlated data x = np.random.randn(n_points) y =2 *
← Ml Data Understanding | Ml Applications →