Text processing is one of Linux's greatest strengths. Whether you're analyzing log files, processing CSV data, or manipulating configuration files, mastering these tools will make you incredibly productive at the command line.
Three tools form the foundation of text processing in Linux: grep for pattern searching, sed for stream editing, and awk for field-based processing. We start with grep:
# Basic pattern search
grep "pattern" file.txt
grep "error" /var/log/syslog
# Case-insensitive search
grep -i "ERROR" logfile.txt
# Show line numbers
grep -n "function" script.py
# Count matching lines
grep -c "warning" logfile.txt
# Show only filenames with matches
grep -l "TODO" *.py
# Show filenames without matches
grep -L "completed" *.txt
# Recursive search in directories
grep -r "config" /etc/
grep -r --include="*.conf" "database" /etc/
# Context lines (before and after)
grep -A 3 "error" logfile.txt # 3 lines after
grep -B 2 "error" logfile.txt # 2 lines before
grep -C 5 "error" logfile.txt # 5 lines before and after
# Multiple patterns
grep -E "error|warning|critical" logfile.txt
grep -e "pattern1" -e "pattern2" file.txt
# Invert match (lines NOT containing pattern)
grep -v "debug" logfile.txt
# Whole word matching
grep -w "user" file.txt # Won't match "username"
# Fixed string search (no regex)
grep -F "literal.string" file.txt
# Beginning and end of line
grep "^start" file.txt # Lines starting with "start"
grep "end$" file.txt # Lines ending with "end"
grep "^$" file.txt # Empty lines
# Character classes
grep "[0-9]" file.txt # Lines containing digits
grep "[A-Z]" file.txt # Lines containing uppercase
grep "[aeiou]" file.txt # Lines containing vowels
# Quantifiers
grep "colou\?r" file.txt # "color" or "colour"
grep "go\+d" file.txt # "god", "good", "goood", etc.
grep "ba.*" file.txt # "ba" followed by anything
# Word boundaries
grep "\buser\b" file.txt # Whole word "user"
grep "\<user\>" file.txt # Alternative syntax
# Find all Python functions
grep -n "^def " *.py
# Find IP addresses in log files
grep -E '[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}' access.log
# Find email addresses
grep -E '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' file.txt
# Find lines with specific length
grep '^.\{80\}$' file.txt # Lines exactly 80 characters
grep '^.\{80,\}' file.txt # Lines 80+ characters
# Extract HTTP status codes (combined log format)
grep -oE 'HTTP/[0-9.]+" [0-9]{3}' access.log
# Find commented lines (various languages)
grep '^[[:space:]]*#' script.sh # Shell comments
grep '^[[:space:]]*//' code.js # JavaScript comments
grep '^[[:space:]]*<!--' page.html # HTML comments
# Substitute (replace) text
sed 's/old/new/' file.txt # Replace first occurrence per line
sed 's/old/new/g' file.txt # Replace all occurrences (global)
sed 's/old/new/2' file.txt # Replace second occurrence per line
# Case-insensitive substitution
sed 's/old/new/gi' file.txt
# In-place editing (modify original file)
sed -i 's/old/new/g' file.txt
sed -i.bak 's/old/new/g' file.txt # Create backup with .bak extension
# Delete lines
sed '3d' file.txt # Delete line 3
sed '2,5d' file.txt # Delete lines 2-5
sed '/pattern/d' file.txt # Delete lines containing pattern
sed '/^$/d' file.txt # Delete empty lines
# Print specific lines
sed -n '10p' file.txt # Print only line 10
sed -n '1,5p' file.txt # Print lines 1-5
sed -n '/pattern/p' file.txt # Print lines containing pattern
# Add lines
sed '3a\New line after line 3' file.txt
sed '3i\New line before line 3' file.txt
sed '$a\New line at end' file.txt
# Multiple operations
sed -e 's/old1/new1/g' -e 's/old2/new2/g' file.txt
sed 's/old1/new1/g; s/old2/new2/g' file.txt
# Using different delimiters
sed 's|/path/old|/path/new|g' file.txt # Useful for paths
sed 's#old#new#g' file.txt # Alternative delimiter
# Line ranges
sed '1,10s/old/new/g' file.txt # Replace only in lines 1-10
sed '/start/,/end/s/old/new/g' file.txt # Replace between patterns
# Backreferences
sed 's/\(word\)/[\1]/g' file.txt # Wrap "word" in brackets
sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/g' file.txt # Swap numbers around dash
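To see backreferences in action, try them on a throwaway string (the input below is made up purely for illustration):
echo "123-456" | sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/' # prints 456-123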
# Address patterns
sed '/pattern/s/old/new/g' file.txt # Replace only in lines with pattern
sed '/pattern/!s/old/new/g' file.txt # Replace only in lines WITHOUT pattern
# Configuration file editing
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/DEBUG/INFO/g' app.conf
# CSV processing
sed 's/,/\t/g' data.csv # Convert CSV to TSV
sed '1d' data.csv # Remove header row
# Code formatting
sed 's/[[:space:]]*$//' file.txt # Remove trailing whitespace
sed '/^[[:space:]]*$/d' file.txt # Remove blank lines
# Log processing
sed -n '/ERROR/,/^$/p' logfile.txt # Extract error blocks
sed 's/.*\[\([^]]*\)\].*/\1/' access.log # Extract timestamps
# HTML processing
sed 's/<[^>]*>//g' page.html # Remove HTML tags
sed 's/&lt;/</g; s/&gt;/>/g; s/&amp;/\&/g' file.html # Decode common HTML entities
awk processes text line by line, splitting each line into fields (columns) by default using whitespace as the delimiter.
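To see this default splitting in action before looking at the syntax, here is a quick example (the input string is invented for illustration):
echo "alpha   beta gamma" | awk '{print $2, NF}' # prints "beta 3"; runs of spaces count as one separator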
# Basic syntax
awk 'pattern { action }' file.txt
# Print specific fields (columns)
awk '{print $1}' file.txt # First field
awk '{print $1, $3}' file.txt # First and third fields
awk '{print $NF}' file.txt # Last field
awk '{print $(NF-1)}' file.txt # Second-to-last field
# Print entire line
awk '{print}' file.txt # Same as cat
awk '{print $0}' file.txt # Explicit way
# Custom field separator
awk -F: '{print $1}' /etc/passwd # Use colon as separator
awk -F',' '{print $2}' data.csv # Use comma for CSV
awk -F'\t' '{print $1}' data.tsv # Use tab for TSV
# Built-in variables
awk '{print NR, NF, $0}' file.txt # Line number, field count, full line
awk '{print FILENAME, FNR}' *.txt # Filename and line number
awk 'END {print NR}' file.txt # Total number of lines
# Pattern matching
awk '/pattern/ {print}' file.txt # Lines containing pattern
awk '/^start/ {print}' file.txt # Lines starting with "start"
awk '$1 == "error" {print}' file.txt # First field equals "error"
awk '$3 > 100 {print}' file.txt # Third field greater than 100
# Ranges
awk '/start/,/end/ {print}' file.txt # Lines between patterns
awk 'NR==5,NR==10 {print}' file.txt # Lines 5 through 10
# Conditional expressions
awk 'length($0) > 80 {print}' file.txt # Lines longer than 80 chars
awk 'NF > 5 {print}' file.txt # Lines with more than 5 fields
awk '$2 ~ /pattern/ {print}' file.txt # Second field matches pattern
awk '$1 !~ /pattern/ {print}' file.txt # First field doesn't match
# Variables and calculations
awk '{sum += $3} END {print sum}' file.txt # Sum third column
awk '{count++} END {print count}' file.txt # Count lines
awk '{sum += $1; count++} END {print sum/count}' file.txt # Average
# Conditionals
awk '{if ($3 > 100) print "High:", $0}' file.txt
awk '{if ($1 == "error") errors++; else others++} END {print errors, others}' file.txt
# Loops
awk '{for(i=1; i<=NF; i++) print i, $i}' file.txt # Print field numbers and values
# Arrays
awk '{count[$1]++} END {for (word in count) print word, count[word]}' file.txt
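As a concrete illustration of this counting idiom (the input is invented, and awk's array traversal order is not guaranteed):
printf 'apple red\napple green\nbanana yellow\n' | awk '{count[$1]++} END {for (w in count) print w, count[w]}'
# prints "apple 2" and "banana 1", in either order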
# Log analysis
awk '/ERROR/ {errors++} /WARNING/ {warnings++} END {print "Errors:", errors, "Warnings:", warnings}' logfile.txt
# CSV processing
awk -F',' '{print $2, $1}' data.csv # Swap first two columns
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Average:", sum/count}' data.csv
# System monitoring
ps aux | awk '{sum += $3} END {print "Total CPU:", sum "%"}'
df -h | awk 'NR>1 && $5+0 > 80 {print $6, $5}' # Show filesystems >80% full ($5+0 strips the % sign)
# Text formatting
awk '{printf "%-20s %s\n", $1, $2}' file.txt # Format columns
awk '{gsub(/old/, "new"); print}' file.txt # Global substitution
# Data extraction
awk '/start_marker/,/end_marker/ {if (!/start_marker/ && !/end_marker/) print}' file.txt
# Multiple files processing
awk 'FNR==1{print "Processing", FILENAME} {print NR, $0}' *.txt
# Basic sorting
sort file.txt # Alphabetical sort
sort -n numbers.txt # Numerical sort
sort -r file.txt # Reverse sort
sort -u file.txt # Unique sort (remove duplicates)
# Field-based sorting
sort -k2 file.txt # Sort by second field
sort -k2,2 file.txt # Sort by second field only
sort -k2n file.txt # Numerical sort by second field
sort -t: -k3n /etc/passwd # Sort passwd by UID (third field)
# Advanced sorting
sort -k2,2n -k1,1 file.txt # Sort by second field numerically, then first alphabetically
sort -c file.txt # Check if file is sorted
sort -m sorted1.txt sorted2.txt # Merge sorted files
# Remove duplicates (requires sorted input)
sort file.txt | uniq
# Count occurrences
sort file.txt | uniq -c
# Show only duplicates
sort file.txt | uniq -d
# Show only unique lines (appear once)
sort file.txt | uniq -u
# Case-insensitive uniqueness
sort file.txt | uniq -i
# Check specific fields
sort file.txt | uniq -f 1 # Skip first field when comparing
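A self-contained demonstration of the sort | uniq -c pattern, using inline input invented for illustration:
printf 'b\na\nb\nb\n' | sort | uniq -c # prints "1 a" and "3 b" (counts are left-padded)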
# Extract by character position
cut -c1-10 file.txt # Characters 1-10
cut -c5- file.txt # Characters 5 to end
cut -c-20 file.txt # Characters 1-20
# Extract by field
cut -d: -f1 /etc/passwd # First field, colon delimiter
cut -d, -f2,4 data.csv # Second and fourth fields
cut -d' ' -f3- file.txt # Third field to end
# Multiple delimiters (using tr first)
tr -s '[:space:]' ',' < file.txt | cut -d, -f2
# Character replacement
tr 'a-z' 'A-Z' < file.txt # Convert to uppercase
tr 'A-Z' 'a-z' < file.txt # Convert to lowercase
tr ' ' '_' < file.txt # Replace spaces with underscores
# Character deletion
tr -d '0-9' < file.txt # Remove all digits
tr -d '\n' < file.txt # Remove newlines (join lines)
tr -d '[:punct:]' < file.txt # Remove punctuation
# Character squeezing
tr -s ' ' < file.txt # Squeeze multiple spaces to one
tr -s '\n' < file.txt # Remove blank lines
# Complement (everything except)
tr -cd '0-9\n' < file.txt # Keep only digits and newlines
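For example, stripping everything except digits from a throwaway string:
echo "Order #42, qty 7" | tr -cd '0-9\n' # prints 427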
# Count lines, words, characters
wc file.txt # All counts
wc -l file.txt # Lines only
wc -w file.txt # Words only
wc -c file.txt # Bytes only
wc -m file.txt # Characters (multibyte-aware)
# Multiple files
wc *.txt # Count for each file plus total
find . -name "*.py" | xargs wc -l # Total lines in all Python files
# Log analysis pipeline
cat access.log | grep "404" | cut -d' ' -f1 | sort | uniq -c | sort -rn | head -10
# Word frequency analysis
cat book.txt | tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]' | tr ' ' '\n' | sort | uniq -c | sort -rn | head -20
# CSV data processing
cat sales.csv | awk -F',' 'NR>1 {sum+=$3} END {print "Total sales:", sum}' | tee results.txt
# Configuration file processing
grep -v '^#' config.txt | grep -v '^$' | sed 's/[[:space:]]*#.*//' | awk -F'=' '{print $1}' | sort
# Extract email domains
grep -oE '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' file.txt | sed 's/.*@//' | sort | uniq -c
# Process JSON with text tools (basic)
cat data.json | grep '"name"' | sed 's/.*"name"[[:space:]]*:[[:space:]]*"//' | sed 's/".*//' | sort
# Extract URLs from HTML
grep -oE 'href="[^"]*"' page.html | sed 's/href="//' | sed 's/"//' | sort | uniq
# Process Apache logs
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -20 # Top IPs
awk '{print $7}' access.log | sort | uniq -c | sort -rn | head -20 # Top pages
awk '$9 >= 400 {print $9, $7}' access.log | sort | uniq -c # Error pages
# Create sample log file
cat > sample.log << EOF
2025-07-19 10:00:01 INFO User login successful: user1
2025-07-19 10:00:15 ERROR Database connection failed
2025-07-19 10:00:30 INFO User login successful: user2
2025-07-19 10:01:45 WARNING High memory usage detected
2025-07-19 10:02:00 ERROR Authentication failed for user1
2025-07-19 10:03:15 INFO User logout: user1
EOF
# Practice tasks
grep "ERROR" sample.log # Find all errors
grep -c "INFO\|ERROR\|WARNING" sample.log # Count log levels
awk '{print $3}' sample.log | sort | uniq -c # Count by log level
sed -n 's/.*\(user[0-9]*\).*/\1/p' sample.log | sort | uniq # Extract unique users
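Given the sample log above, the log-level count should produce output along these lines:
awk '{print $3}' sample.log | sort | uniq -c
#   2 ERROR
#   3 INFO
#   1 WARNING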
# Create sample CSV
cat > employees.csv << EOF
Name,Department,Salary,Years
John Doe,Engineering,75000,5
Jane Smith,Marketing,65000,3
Bob Johnson,Engineering,80000,7
Alice Brown,Sales,60000,2
Charlie Wilson,Engineering,90000,10
EOF
# Practice tasks
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Average salary:", sum/count}' employees.csv
awk -F',' '$2=="Engineering" {print $1, $3}' employees.csv
awk -F',' 'NR>1 {dept[$2]+=$3; count[$2]++} END {for(d in dept) print d, dept[d]/count[d]}' employees.csv
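With the sample CSV above, the results are small enough to check by hand, which makes them a good sanity test:
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Average salary:", sum/count}' employees.csv
# Average salary: 74000
awk -F',' '$2=="Engineering" {print $1, $3}' employees.csv
# John Doe 75000
# Bob Johnson 80000
# Charlie Wilson 90000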
# Create messy text file
cat > messy.txt << EOF
This has multiple spaces
And some tabs mixed in
UPPERCASE and lowercase
Special characters: !@#$%^&*()
Numbers123and456letters
EOF
# Clean up tasks
sed 's/[[:space:]]\+/ /g' messy.txt # Fix spacing
tr '[:upper:]' '[:lower:]' < messy.txt # Normalize case
tr -d '[:punct:]' < messy.txt # Remove punctuation
tr -cd '[:alnum:][:space:]' < messy.txt # Keep only letters, numbers, spaces
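The individual cleanup steps can also be chained into a single pipeline; a minimal sketch using the commands above:
sed 's/[[:space:]]\+/ /g' messy.txt | tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]'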
1. Use appropriate tools: grep for searching, sed for simple substitutions, awk for field processing
2. Minimize pipeline stages: Combine operations when possible
3. Use specific options: grep -F for literal strings, sort -n for numbers
4. Process large files efficiently: Use head or tail to sample first
# Efficient: single awk command
awk '/pattern/ {gsub(/old/, "new"); print}' largefile.txt
# Less efficient: multiple commands
grep "pattern" largefile.txt | sed 's/old/new/g'
# For very large files, process in chunks
split -l 1000000 hugefile.txt chunk_
for chunk in chunk_*; do
process_chunk "$chunk"
done
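One concrete (hypothetical) way to fill in that loop is to count ERROR lines per chunk, sum the counts, and clean up afterwards:
for chunk in chunk_*; do
    grep -c "ERROR" "$chunk"        # per-chunk error count
done | awk '{total += $1} END {print "Total errors:", total}'
rm chunk_*                          # remove the temporary chunks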
# Use streaming when possible
tail -f logfile.txt | grep "ERROR" | awk '{print $1, $2}'
grep "pattern" file # Basic search
grep -i "pattern" file # Case insensitive
grep -r "pattern" dir/ # Recursive
grep -v "pattern" file # Invert match
grep -E "pat1|pat2" file # Extended regex
grep -A3 -B3 "pattern" file # Context lines
sed 's/old/new/g' file # Global replace
sed -i 's/old/new/g' file # In-place edit
sed '5d' file # Delete line 5
sed -n '1,10p' file # Print lines 1-10
sed '/pattern/d' file # Delete matching lines
awk '{print $1}' file # First column
awk -F: '{print $1}' file # Custom delimiter
awk '/pattern/ {print}' file # Pattern matching
awk '{sum+=$1} END {print sum}' file # Sum column
awk 'NF>5 {print}' file # Lines with >5 fields
- Master grep for pattern searching and filtering
- Use sed for stream editing and in-place substitutions
- Reach for awk when you need field-based processing and calculations
- Combine sort, uniq, cut, tr, and wc in pipelines for quick analysis
These text processing skills form the foundation of data manipulation and analysis in Linux. With practice, you'll find yourself solving complex text processing tasks with elegant one-liners that would require complex programs in other environments.
---
This is Part 6 of our comprehensive Linux mastery series.
Previous: Linux Permissions & Security - Master file permissions and ownership
Next: Package Management - Learn to install, update, and manage software packages
Ready to Manage Software? Continue with package management to install and maintain applications!
---
Coming next: Understanding package management - learn how to install, update, and manage software packages across different Linux distributions.