Back
Featured image of post 文本处理三剑客之awk

文本处理三剑客之awk

本文介绍 linux 下最常用文本处理工具之一 awk, 从零基础起点出发, 涵盖 awk 编程绝大多数使用命令及场景

awk 工作模式

与 sed 相同, 都是逐行处理

语法格式

  1. awk 'BEGIN{}pattern{commands}END{}' filename
  2. stdout | awk 'BEGIN{}pattern{commands}END{}'
语法格式 说明
BEGIN{} 处理文本前执行
pattern 匹配模式
{commands} 处理命令, ;隔开
END{} 处理文本后执行

BEGIN{}, pattern, END{} 都可省略

内置变量

内置变量 含义
$0 整行内容
$1-$n 按分隔符的第 1-n 个字段
NF (Number of Fields) 当前行的字段个数(多少列)
NR (Number of Records) 当前行行号, 从 1 开始计数
FNR (File Number of Records) 多文件处理时, 每个文件单独计数, 从 1 开始
FS (Field Separator) 输入字段分隔符, 不指定为空格或 tab
RS (Record Separator) 输入行分隔符, 默认换行符
OFS (Output Field Separator) 输出字段分隔符, 默认空格
ORS (Output Record Separator) 输出行分隔符, 默认换行符
FILENAME 当前输入文件名
ARGC 命令行参数个数
ARGV 命令行参数数组

基本使用:


$ awk '{print $0}' passwd
$ awk '{print $1,$3}' list
$ awk '{print NF}' list
$ awk '{print NR}' list
$ awk '{print FNR}' list awk.txt

$ awk 'BEGIN{FS=":"}{print $1}' passwd
$ awk 'BEGIN{RS="--"}{print $0}' list
$ awk 'BEGIN{FS=":";OFS="|"}{print $1,$3}' passwd  # 每行输出字段
$ awk 'BEGIN{ORS="--"}{print $0}' passwd

$ awk '{print FILENAME}' passwd
$ awk '{print ARGC}' passwd list  # 3 个参数 awk, passwd, list

格式化输出 printf

格式符 含义
%s 字符串
%d 十进制
%f 浮点数
%x 十六进制
%o 八进制
%e 科学计数法
%c 单个字符

修饰符:

修饰符 含义
- 左对齐
+ 显示数值的正负号 (不加 - 时默认即为右对齐)
# 打印 十六进制与八进制时使用, 在前打印进制标识

示例

# %s: 默认左对齐
# %10s: 默认右对齐
$ awk 'BEGIN{FS=":"}{printf "%s",$7}' passwd
$ awk 'BEGIN{FS=":"}{printf "%10s",$7}' passwd
$ awk 'BEGIN{FS=":"}{printf "%-10s",$7}' passwd

$ awk 'BEGIN{FS=":"}{printf "%d",$3}' passwd
$ awk 'BEGIN{FS=":"}{printf "%0.3f",$3}' passwd
$ awk 'BEGIN{FS=":"}{printf "%x",$3}' passwd
$ awk 'BEGIN{FS=":"}{printf "%#x",$3}' passwd  # 显示 16 进制标识
$ awk 'BEGIN{FS=":"}{printf "%o",$3}' passwd  # 八进制
$ awk 'BEGIN{FS=":"}{printf "%e",$3}' passwd

模式匹配

  1. 正则表达式 (固定写法//)
  2. 关系运算匹配

正则表达式:

# 含有 root 的行
$ awk 'BEGIN{FS=":"}/root/{print $0}' passwd
# 以 nginx 开头
$ awk 'BEGIN{FS=":"}/^nginx/{print $0}' passwd

关系运算符:

关系运算符 含义
< 数值 小于
> 数值 大于
<= 数值 小于等于
>= 数值 大于等于
== 等于
!= 不等于
~ 匹配正则
!~ 不匹配正则
# 第 3 个字段小于 50
$ awk 'BEGIN{FS=":"}$3<50{print $0}' passwd

# 第 7 个字段为 /bin/bash
$ awk 'BEGIN{FS=":"}$7=="/bin/bash"{print $0}' passwd
$ awk 'BEGIN{FS=":"}$7!="/bin/bash"{print $0}' passwd

# 第 3 个字段包含 3 个及以上数字
$ awk 'BEGIN{FS=":"}$3~/[0-9]{3,}/{print $0}' passwd

逻辑运算符:

逻辑运算符 含义
|| 或
&& 与
! 非
$ awk 'BEGIN{FS=":"}$1=="root"||$1=="nginx"{print $0}' passwd

$ awk 'BEGIN{FS=":"}$3<50 && $3 >30 {print $0}' passwd

awk 算数运算

运算符 含义
+ 加
- 减
* 乘
/ 除
% 取余
^ 或 ** 乘方
x++ ; x-- 先返回 x, 后 +/- x
++x ; --x 先 +/- x, 后 返回 x

练习计算课程平均值

# 右对齐
$ awk 'BEGIN{printf "%10s%10s%10s%10s%10s\n","Name", "YuWen","ShuXue", "English", "AVG"}{total=$2+$3+$4;AVG=total/3;printf "%10s%10d%10d%10d%10.2f\n",$1,$2,$3,$4,AVG}' list

$ awk 'BEGIN{printf "%-10s%-10s%-10s%-10s%-10s\n","Name", "YuWen","ShuXue", "English", "AVG"}{total=$2+$3+$4;AVG=total/3;printf "%-10s%-10d%-10d%-10d%-10.2f\n",$1,$2,$3,$4,AVG}' list

条件语句

if-else

示例: script.awk

# script.awk - classify each record by the numeric UID held in field 3.
# Input : colon-separated records (e.g. /etc/passwd); $1 = user name, $3 = UID.
# Output: one line per record with a range label, the user name and the UID.
BEGIN{
	FS=":"
}

{
	# The ranges are half-open so that every UID, including the boundary
	# values 50 and 100, is reported under a label that really describes it.
	if($3<50)
	{
		printf "%-20s%-10s%10d\n","UID<50",$1,$3
	}
	else if($3<100)
	{
		printf "%-20s%-10s%10d\n","50<=UID<100",$1,$3
	}
	else
	{
		printf "%-20s%-10s%10d\n","UID>=100",$1,$3
	}
}
$ awk -f script.awk /etc/passwd

循环语句

  1. do-while
  2. while
  3. for

计算 1+2+…+100

  1. do-while
# Add up the integers 1..100 with a do-while loop and print the sum.
# The body runs once before the condition is checked; on that first pass the
# counter is still unset, which awk evaluates as 0.
BEGIN {
    do {
        acc = acc + n
        n = n + 1
    } while (n <= 100)

    print acc
}
  2. while
# Add up the integers 1..100 with a while loop and print the sum.
# The counter starts unset, which awk treats as 0 in the numeric comparison.
BEGIN {
    while (n <= 100) {
        acc += n
        n += 1
    }
    print acc
}
  3. for
# Add up the integers 0..100 with a for loop and print the sum.
BEGIN {
    for (n = 0; n <= 100; ++n)
        acc += n

    print acc
}

练习: 打印平均分大于 70的, 并计算平均分

# Print every student whose average over the three subjects is above 70,
# followed by one summary line holding the per-column averages of the
# students that were printed.
# Input : whitespace-separated records: $1 name, $2 YuWen, $3 Math, $4 English.
BEGIN{
    printf "%-10s%-10s%-10s%-10s%-10s\n","Name","YuWen","Math","English","AVG"
}

{
    # Each of the three subject columns contributes exactly once to the total.
    total = $2 + $3 + $4
    avg = total / 3
    if (avg > 70)
    {
        printf "%-10s%-10d%-10d%-10d%-0.2f\n",$1,$2,$3,$4,avg
        score_yuwen += $2
        score_math += $3
        score_english += $4
        score_avg += avg
        count++
    }

}

END{
    # When no student qualified there is nothing to average; skip the summary
    # line instead of dividing by zero.
    if (count > 0)
    {
        printf "%-10s%-10.2f%-10.2f%-10.2f%-0.2f\n","",score_yuwen/count,score_math/count,score_english/count,score_avg/count
    }
}

字符串函数

函数名 解释 返回值
length(str) 计算字符串长度 长度值
index(str1,str2) 在 str1 中查找 str2 返回位置索引, 从 1 计数
tolower(str) 转小写 转小写后的字符串
toupper(str) 转大写 转大写后的字符串
substr(str,m,n) 从 str m 字符, 截取 n 位(n 可省略) 截取后的子串
split(str,arr,fs) 按 fs 切割字符串, 结果保存到 arr 切割后的子串个数
match(str,RE) 与 index() 类似, 但支持正则(RE) 返回索引位置
sub(RE,RepStr,str) 在 str 中搜索符合 RE 的子串将其替换为 RepStr; 只替换第一个 替换个数
gsub(RE,RepStr,str) 在 str 中搜索符合 RE 的子串将其替换为 RepStr; 替换全部 替换个数

1. 打印 passwd 每个字段长度:

# For every colon-separated record, print the length of each field,
# joining the lengths with ':' (no trailing separator).
BEGIN {
    FS = ":"
}

{
    for (col = 1; col <= NF; col++) {
        # The last field is followed by nothing, every other one by ':'.
        sep = (col == NF) ? "" : ":"
        printf "%d%s", length($col), sep
    }
    print ""
}

2. 查询"I have a dream"中"ea"索引

$ awk 'BEGIN{str="I have a dream";localtion=index(str,"ea");print localtion}'
# 12
$ awk 'BEGIN{str="I have a dream";localtion=match(str,"ea");print localtion}'
# 12

3. 大小写转换

$ awk 'BEGIN{str="I have a dream";print tolower(str)}'
# i have a dream
$ awk 'BEGIN{str="I have a dream";print toupper(str)}'
# I HAVE A DREAM

4. 切分数组

$ awk 'BEGIN{str="I have a dream";split(str,arr," ");print arr[2]}'
$ awk 'BEGIN{str="I have a dream";split(str,arr);print arr[2]}'  # 默认空格分隔
# have

# 遍历, 不是顺序遍历
$ awk 'BEGIN{str="I have a dream";split(str,arr);for(a in arr) print arr[a]}'
# dream
# I
# have
# a

5. 搜索第一个出现的数字

# 正则必须用 //
$ awk 'BEGIN{str="I have a 123 dream"; print match(str, /[0-9]/)}'
# 10

6. 截取子串

$ awk 'BEGIN{str="I have a 123 dream"; print substr(str,3,7)}'
# have a 

$ awk 'BEGIN{str="I have a 123 dream"; print substr(str,3)}'
# have a 123 dream

7. 替换数字

$ awk 'BEGIN{str="I have a 123 dream 324 hello"; print sub(/[0-9]+/,"$",str); print str}'
# 1
# I have a $ dream 324 hello

$ awk 'BEGIN{str="I have a 123 dream 324 hello"; print gsub(/[0-9]+/,"$",str); print str}'
# 2
# I have a $ dream $ hello

awk 常用选项

选项 说明
-v 参数传递
-f 指定脚本文件
-F 指定分隔符
-V 查看版本

如果变量有空格, 要使用""

$ num=13
$ var="hello world"
$ awk -v num1=$num -v var1=$var 'BEGIN{print num1,var1}'
# awk: fatal: cannot open file `BEGIN{print num1,var1}' for reading (No such file or directory)

$ awk -v num1="$num" -v var1="$var" 'BEGIN{print num1,var1}'
# 13 hello world


$ awk -F ":" '{print $0}' /etc/passwd

awk 与 shell 中数组

shell 中数组

下标从 0 开始

打印数组:

$ arr=("kubernetes" "etcd" "time" "redis")

# 打印数组
$ echo ${arr[@]}
$ echo ${arr[*]}
# kubernetes etcd time redis

# 打印元素
$ echo ${arr[2]}
# time

打印数组/元素长度; 分片访问; 元素操作; 删除元素:

$ arr=("kubernetes" "etcd" "time" "redis")

# 打印数组长度 (元素个数)
$ echo ${#arr[@]}
$ echo ${#arr[*]}
# 4

# 打印元素长度 (字符个数)
$ echo ${#arr[3]}
# 5

# 分片访问
$ echo ${arr[@]:1:3}
# etcd time redis

# 元素赋值
$ arr[2]=mysqlserver
$ echo ${arr[@]}
# kubernetes etcd mysqlserver redis

# 元素内容替换
$ echo ${arr[@]/e/E}
# kubErnetes Etcd mysqlsErver rEdis
$ echo ${arr[@]//e/E}
# kubErnEtEs Etcd mysqlsErvEr rEdis

# 元素删除  *** 通过下标删除后, 被删除的下标的元素为空, 原数组的其他元素下标不变
$ unset arr[0]
$ echo ${arr[@]}
# etcd mysqlserver redis
$ unset arr[0]
$ echo ${arr[@]}
# etcd mysqlserver redis
$ unset arr[1]
$ echo ${arr[@]}
# mysqlserver redis

# 删除数组
$ unset arr

通过下标删除后, 被删除的下标的元素为空, 原数组的其他元素下标不变

数组的遍历

$ arr=("kubernetes" "etcd" "time" "redis")
$ for a in ${arr[@]}; do echo $a; done
# kubernetes
# etcd
# time
# redis

awk 中数组

脚本练习

数据生成脚本:

#!/bin/bash
#

#######################################
# Print a pseudo-random integer in the inclusive range [MIN, MAX].
# Arguments:
#   $1 - MIN, lower bound (integer)
#   $2 - MAX, upper bound (integer, must be >= MIN)
# Outputs:
#   The chosen number on stdout; a diagnostic on stderr on failure.
# Returns:
#   0 on success, 1 if the range is empty (MAX < MIN),
#   2 if an argument is missing or not an integer.
#######################################
function create_random()
{
    # Locals keep the helper from clobbering caller variables of the same name.
    local lo=${1-} hi=${2-}
    local span seed

    if [[ ! $lo =~ ^-?[0-9]+$ || ! $hi =~ ^-?[0-9]+$ ]]; then
        printf 'create_random: usage: create_random MIN MAX (integers)\n' >&2
        return 2
    fi

    # An empty range would make the modulo below divide by zero.
    span=$(( hi - lo + 1 ))
    if (( span <= 0 )); then
        printf 'create_random: empty range [%s, %s]\n' "$lo" "$hi" >&2
        return 1
    fi

    # Nanosecond timestamp as entropy source; %N is a GNU date extension, so
    # fall back to the shell's own generator when the output is not numeric.
    seed=$(date +%s%N 2>/dev/null)
    if [[ ! $seed =~ ^[0-9]+$ ]]; then
        seed=$(( (RANDOM << 15) | RANDOM ))
    fi

    printf '%s\n' "$(( seed % span + lo ))"
}

# Endless generator of sample log records: for each user in turn, append one
# "insert" record to ./db.log.YYYYMMDD. It never terminates by itself; stop it
# with Ctrl-C once enough data has been written.
INDEX=1

while true; do
    for user in allen mike jerry tracy han lilei; do
        COUNT=$RANDOM
        # $RANDOM can be 0; in that case no record can succeed, and asking
        # create_random for a number in the empty range [1, 0] must be avoided.
        if (( COUNT > 0 )); then
            NUM1=$(create_random 1 "$COUNT")
        else
            NUM1=0
        fi
        NUM2=$(( COUNT - NUM1 ))

        # Take timestamp and file-name date from one date call so that they
        # cannot disagree when a record is written right at midnight.
        now=$(date '+%Y-%m-%d %H:%M:%S %Y%m%d')
        printf '%s %s Batches: user %s insert %s records into database:product table:detail, insert %s records successfully, failed %s records\n' \
            "${now% *}" "$INDEX" "$user" "$COUNT" "$NUM1" "$NUM2" >> "./db.log.${now##* }"

        INDEX=$(( INDEX + 1 ))
    done
done

数据格式

2023-12-12 02:49:31 1 Batches: user allen insert 25719 records into database:product table:detail, insert 24482 records successfully, failed 1237 records
2023-12-12 02:49:31 2 Batches: user mike insert 32653 records into database:product table:detail, insert 26055 records successfully, failed 6598 records
2023-12-12 02:49:31 3 Batches: user jerry insert 16986 records into database:product table:detail, insert 11636 records successfully, failed 5350 records
2023-12-12 02:49:31 4 Batches: user tracy insert 31899 records into database:product table:detail, insert 9250 records successfully, failed 22649 records
2023-12-12 02:49:31 5 Batches: user han insert 24256 records into database:product table:detail, insert 24033 records successfully, failed 223 records

统计所有成功, 失败, 总共记录数

count.awk:

# count.awk - per-user sums of the total, successful and failed record counts,
# followed by one grand-total line.
# Fields read from each log line: $6 user, $8 total, $14 successful, $18 failed.
BEGIN {
    row = "%-10s%-20s%-20s%-20s\n"
    printf row, "User", "Total", "Sucess", "Failed"
}

{
    who = $6
    TOTAL[who] += $8
    SUCESS[who] += $14
    FAILED[who] += $18
}

END {
    # for-in visits the users in an unspecified order.
    for (who in TOTAL) {
        printf row, who, TOTAL[who], SUCESS[who], FAILED[who]
        all_total += TOTAL[who]
        all_sucess += SUCESS[who]
        all_failed += FAILED[who]
    }
    printf row, "", all_total, all_sucess, all_failed
}
$ awk -f count.awk db.log.20231212
User      Total               Sucess              Failed
tracy     6096344             2963340             3133004
allen     6293470             3182865             3110605
mike      5845083             2912982             2932101
jerry     5996178             3080723             2915455
lilei     6217104             3028971             3188133
han       5923975             3089899             2834076
          36372154            18258780            18113374

2. 打印丢失记录的行数

一条记录行中, 总记录数 != 成功记录数 + 失败记录数

$ awk '{if($8 != $14 + $18) print NR}' db.log.20231212