From Beginner to Pro: Step-by-Step Learning Path
Version: 1.0 | Updated: July 2025 | Duration: 2-4 hours
Google Colaboratory (Colab) is a free cloud-based Jupyter notebook environment that allows you to write and execute Python code through your browser. It provides free access to computing resources including GPUs and TPUs.
```python
# Option 1: Create a new notebook
#   Click "New notebook" on the welcome screen

# Option 2: From Google Drive
#   Go to drive.google.com → New → More → Google Colaboratory
```
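Once a notebook is open, a quick sanity check confirms what the hosted runtime provides (a minimal sketch; any code cell will do):

```python
import sys

# Colab runs a hosted Python environment; print its version
print(sys.version)

# The underlying VM is Linux, so shell commands (prefixed with !) work too
!uname -a
```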
- **Menu bar:** File, Edit, View, Insert, Runtime, Tools, Help
- **Toolbar:** buttons for adding code/text cells, running cells, and sharing the notebook
- **Cell types:** code cells (for Python code) and text cells (for Markdown documentation)
- **Runtime settings:** runtime type and hardware accelerator (CPU, GPU, or TPU); see the sketch after this list to verify what you actually got
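After changing the hardware accelerator, a few standard Linux commands confirm what the runtime actually provides:

```python
# Show GPU details (prints a "not found" message if no GPU is attached)
!nvidia-smi

# Show available RAM and disk space
!free -h
!df -h /
```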
```python
# Your first Python code in Colab
print("Hello, Google Colab!")

# Basic calculations
x = 10
y = 20
result = x + y
print(f"The sum of {x} and {y} is {result}")
```
```markdown
# This is a Markdown cell
## You can add:
- **Bold text**
- *Italic text*
- `Code snippets`
- [Links](https://colab.research.google.com)
- Lists and tables

### Mathematical equations:
$$E = mc^2$$
```
```python
# Variables persist between cells
name = "Data Analyst"
age = 25
skills = ["Python", "Data Analysis", "Machine Learning"]

print(f"I am a {name}, {age} years old")
print("My skills:", ", ".join(skills))
```
```python
# Simple script execution
import statistics

def analyze_data(data):
    """Simple data analysis function"""
    mean_value = statistics.mean(data)
    median_value = statistics.median(data)
    return {
        'mean': mean_value,
        'median': median_value,
        'count': len(data)
    }

# Test the function
sample_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
results = analyze_data(sample_data)
print("Analysis Results:", results)
```
```python
# Upload and run external .py files
from google.colab import files

# Upload a Python file from your local machine
uploaded = files.upload()

# Run the uploaded script (replace 'your_script.py' with your file's name)
exec(open('your_script.py').read())
```
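Since `files.upload()` returns a dict keyed by filename, a small sketch like the following avoids hard-coding the script name:

```python
from google.colab import files

uploaded = files.upload()

# Run every uploaded .py file without hard-coding its name
for filename in uploaded:
    if filename.endswith('.py'):
        exec(open(filename).read())
```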
```python
# Clone and run scripts from a GitHub repository
!git clone https://github.com/username/repository.git

# Navigate to the directory and run the script
%cd repository
!python script_name.py
```
```python
# Install a single package
!pip install pandas

# Install multiple packages
!pip install pandas numpy matplotlib seaborn

# Install a specific version
!pip install pandas==1.5.0

# Install from a requirements file
!pip install -r requirements.txt
```
```python
# Upgrade a package
!pip install --upgrade pandas

# Upgrade TensorFlow (the separate tensorflow-gpu package is deprecated;
# GPU support now ships in the main tensorflow package)
!pip install --upgrade tensorflow

# List installed packages
!pip list

# Show package information
!pip show pandas
```
```python
# Check if a package is installed
try:
    import pandas as pd
    print("Pandas is installed")
except ImportError:
    print("Installing pandas...")
    !pip install pandas
    import pandas as pd
    print("Pandas installed and imported successfully")

# Create a requirements file from the current environment
!pip freeze > requirements.txt
```
```python
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Verify the mount
if os.path.exists('/content/drive/MyDrive'):
    print("Google Drive mounted successfully!")
    print("Available folders:")
    print(os.listdir('/content/drive/MyDrive'))
else:
    print("Failed to mount Google Drive")
```
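When you are finished writing to Drive, pending writes can be flushed explicitly before the runtime recycles:

```python
from google.colab import drive

# Flush pending writes and unmount Google Drive
drive.flush_and_unmount()
```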
```python
import pandas as pd

# Read a CSV from Google Drive
drive_path = '/content/drive/MyDrive/Data/'
file_name = 'dataset.csv'
full_path = drive_path + file_name

try:
    df = pd.read_csv(full_path)
    print(f"Successfully loaded {file_name}")
    print(f"Shape: {df.shape}")
    print(df.head())
except FileNotFoundError:
    print(f"File {file_name} not found in {drive_path}")
```
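To discover which files are actually present, a short sketch (the `Data` folder name is an assumption carried over from above) lists every CSV in the Drive folder:

```python
from pathlib import Path

# List every CSV file in the Drive folder
data_dir = Path('/content/drive/MyDrive/Data')
for csv_file in data_dir.glob('*.csv'):
    print(csv_file.name)
```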
```python
import os
import pandas as pd

# Create sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000]
}
df = pd.DataFrame(data)

# Save to Google Drive
drive_output_path = '/content/drive/MyDrive/Output/'

# Create the directory if it doesn't exist
os.makedirs(drive_output_path, exist_ok=True)

# Save the DataFrame as CSV
df.to_csv(drive_output_path + 'processed_data.csv', index=False)
```
```python
from google.colab import files
import pandas as pd

# Create sample data
df = pd.DataFrame({
    'A': range(1, 101),
    'B': range(101, 201),
    'C': range(201, 301)
})

# Save as CSV
df.to_csv('sample_data.csv', index=False)

# Download the file to your local machine
files.download('sample_data.csv')
```
```python
import pandas as pd
from google.colab import files

# Create multiple DataFrames
df1 = pd.DataFrame({'Product': ['A', 'B', 'C'], 'Sales': [100, 200, 150]})
df2 = pd.DataFrame({'Month': ['Jan', 'Feb', 'Mar'], 'Revenue': [10000, 15000, 12000]})

# Create an Excel file with multiple sheets
with pd.ExcelWriter('comprehensive_report.xlsx', engine='openpyxl') as writer:
    df1.to_excel(writer, sheet_name='Sales_Data', index=False)
    df2.to_excel(writer, sheet_name='Revenue_Data', index=False)

print("Excel file created with multiple sheets!")
files.download('comprehensive_report.xlsx')
```
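Reading the workbook back is symmetric; passing `sheet_name=None` to `pd.read_excel` returns a dict of DataFrames keyed by sheet name:

```python
import pandas as pd

# Load every sheet back into a dict of DataFrames
sheets = pd.read_excel('comprehensive_report.xlsx', sheet_name=None)
for name, frame in sheets.items():
    print(name, frame.shape)
```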
```python
# Check and enable GPU
# Set the runtime to GPU first: Runtime → Change runtime type → GPU
import time

import numpy as np
import tensorflow as tf

# Check whether a GPU is available
print("GPU Available:", tf.config.list_physical_devices('GPU'))

def matrix_multiplication_cpu():
    """CPU-based matrix multiplication"""
    a = np.random.random((1000, 1000))
    b = np.random.random((1000, 1000))
    start_time = time.time()
    np.dot(a, b)
    return time.time() - start_time

def matrix_multiplication_gpu():
    """GPU-based matrix multiplication"""
    with tf.device('/GPU:0'):
        a = tf.random.normal((1000, 1000))
        b = tf.random.normal((1000, 1000))
        start_time = time.time()
        result = tf.matmul(a, b)
        result.numpy()  # Block until the GPU finishes so the timing is honest
        return time.time() - start_time

matrix_multiplication_gpu()  # Warm-up run so one-time GPU setup isn't timed
cpu_time = matrix_multiplication_cpu()
gpu_time = matrix_multiplication_gpu()

print(f"CPU Time: {cpu_time:.4f} seconds")
print(f"GPU Time: {gpu_time:.4f} seconds")
print(f"Speedup: {cpu_time / gpu_time:.2f}x")
```
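If your workflow is PyTorch-based rather than TensorFlow-based, the same accelerator is visible through Torch's CUDA interface (PyTorch comes preinstalled in Colab):

```python
import torch

# Check whether PyTorch can see the GPU
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
```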
```python
# IPython magic commands
%timeit sum(range(100))   # Time a single statement

# %%timeit must be the FIRST line of its own cell to time the whole cell:
# %%timeit
# sum(range(100))

# System commands
!ls -la                               # List files
!pwd                                  # Print the current directory
!wget https://example.com/file.zip    # Download a file
!unzip file.zip                       # Extract an archive

# Environment variables
import os
os.environ['CUSTOM_VAR'] = 'value'
!echo $CUSTOM_VAR

# Memory usage (requires the memory_profiler extension)
!pip install memory_profiler
%load_ext memory_profiler
%memit sum(range(100000))

# Load an external script's contents into the current cell
%load script.py
```
```python
import pandas as pd
import numpy as np
# Imported for later pipeline stages (scaling, splitting, modeling)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

class DataProcessor:
    """Advanced data processing pipeline"""

    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.model = None

    def load_data(self):
        """Load and validate data"""
        try:
            self.df = pd.read_csv(self.data_path)
            print(f"Data loaded successfully: {self.df.shape}")
            return True
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

    def explore_data(self):
        """Comprehensive data exploration"""
        if self.df is None:
            print("No data loaded")
            return

        print("=== Data Overview ===")
        self.df.info()  # info() prints directly; wrapping it in print() shows "None"

        print("\n=== Statistical Summary ===")
        print(self.df.describe())

# Usage example
# processor = DataProcessor('your_data.csv')
# processor.load_data()
# processor.explore_data()
```
```python
import gc

import pandas as pd
import psutil

# Memory management
def optimize_memory():
    """Optimize memory usage"""
    gc.collect()  # Force garbage collection

    # Monitor memory usage
    memory_usage = psutil.virtual_memory().percent
    print(f"Memory usage: {memory_usage}%")
    return memory_usage

# Efficient data loading for large files
def load_large_csv(filepath, chunksize=10000):
    """Load a large CSV file in chunks"""
    chunks = []
    for chunk in pd.read_csv(filepath, chunksize=chunksize):
        processed_chunk = chunk.dropna()
        chunks.append(processed_chunk)
    return pd.concat(chunks, ignore_index=True)
```
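Another common memory lever is shrinking DataFrame dtypes; here is a hedged sketch (the `downcast_numeric` helper is illustrative, not a pandas built-in):

```python
import numpy as np
import pandas as pd

def downcast_numeric(df):
    """Downcast numeric columns to the smallest dtype that fits the data"""
    for col in df.select_dtypes(include='integer').columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include='float').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df

# Quick demonstration on synthetic data
df = pd.DataFrame({'a': np.arange(10000), 'b': np.random.random(10000)})
print("Before:", df.memory_usage(deep=True).sum(), "bytes")
df = downcast_numeric(df)
print("After: ", df.memory_usage(deep=True).sum(), "bytes")
```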
```python
import logging
import pandas as pd

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def robust_data_processing(filepath):
    """Example of robust error handling"""
    try:
        # Load data
        df = pd.read_csv(filepath)
        logger.info(f"Successfully loaded {filepath}")

        # Validate data
        if df.empty:
            raise ValueError("DataFrame is empty")

        return df
    except FileNotFoundError:
        logger.error(f"File {filepath} not found")
        return None
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        return None
```
Solution: Save work frequently to Google Drive and use checkpoints
```python
import os
import pickle

# Automatic saving function
def auto_save_checkpoint(data, checkpoint_name):
    """Save a checkpoint to Google Drive"""
    checkpoint_path = '/content/drive/MyDrive/checkpoints/'
    os.makedirs(checkpoint_path, exist_ok=True)

    with open(f'{checkpoint_path}{checkpoint_name}.pkl', 'wb') as f:
        pickle.dump(data, f)
    print(f"Checkpoint saved: {checkpoint_name}")
```
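A matching loader (a sketch assuming the same checkpoints folder) restores state after a disconnect:

```python
import pickle

def load_checkpoint(checkpoint_name):
    """Load a previously saved checkpoint from Google Drive"""
    checkpoint_path = '/content/drive/MyDrive/checkpoints/'
    with open(f'{checkpoint_path}{checkpoint_name}.pkl', 'rb') as f:
        return pickle.load(f)
```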
Solution: Use safe installation with error handling
```python
def install_package_safely(package_name, version=None):
    """Safely install a package with error handling"""
    try:
        __import__(package_name)
        print(f"{package_name} is already installed")
        return True
    except ImportError:
        package_spec = f"{package_name}=={version}" if version else package_name
        !pip install {package_spec}
        # Verify the installation actually succeeded before reporting success
        try:
            __import__(package_name)
            return True
        except ImportError:
            print(f"Failed to install {package_name}")
            return False
```
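A brief usage example (the package names and pinned version are illustrative):

```python
# Install if missing; pin a version when reproducibility matters
install_package_safely('seaborn')
install_package_safely('pandas', version='2.0.3')
```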
- Google Colab (https://colab.research.google.com): start coding immediately in your browser
- "Welcome to Colaboratory" notebook: Google's introduction to Colaboratory
- Colab FAQ (https://research.google.com/colaboratory/faq.html): common questions and troubleshooting
- PyPI (https://pypi.org): browse available packages for installation
You've completed the comprehensive Google Colab guide from beginner to pro level!