vartak-results-data


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176

#!/bin/sh

# Print the usage summary on stdout.
# The here-document delimiter is quoted so the text is emitted verbatim.
help() {
	cat << 'EOF'
vartak-results-data - convert vartak results pdf into CSV data files

USAGE:
	vartak-results-data [OPTION]... <FILE>

OPTIONS:
	-d DEST   produce data in the specified DESTination path/directory
	-t        top header
	-h        show this help message
EOF
}

# Write one "WARNING: <arg>" line per argument to stderr.
# The redirection is attached to the function body, so everything the
# function prints goes to stderr.
warn() {
	printf "WARNING: %s\n" "$@"
} >&2
# Print an error message to stderr, prefixed with the program name, and
# terminate the script with status 1.
# Each argument is formatted with %b, so callers may embed backslash
# escapes such as \n and \t in the message.
err() {
	printf "vartak-results-data: %b\n" "$@" >&2
	exit 1
}

# Parse command-line options.
# The leading ':' in the optstring selects getopts' silent mode: getopts
# prints nothing itself and stores the offending option letter in OPTARG,
# so the messages below can actually name it.
#   -d DEST  destination directory (exported as DEST)
#   -t       sets tflag=1
#   -h       print the help text and exit
while getopts ':td:h' o; do
	case "$o" in
		d) export DEST="$OPTARG" ;;
		t) tflag=1 ;;
		h) help; exit ;;
		:) err "option requires an argument -- '$OPTARG'" ;;
		*) err "invalid option -- '$OPTARG'" ;;
	esac
done
# Drop the parsed options so that "$1" is the FILE operand.
shift $((OPTIND - 1))

# A FILE operand is mandatory; without one, print the usage on stderr and fail.
if [ "$#" -lt 1 ]; then
	help >&2
	exit 1
fi

# Files whose name contains "ATKT" are rejected outright.
case "$1" in
	*ATKT*) err "ATKT files are not supported" ;;
esac

# Choose how to read the input from its detected MIME type.
# On success "$file" names a text file to parse; tmp=1 marks it as a
# temporary file that has to be removed once processing is done.
filetype="$(file --mime-type --brief -- "$1")"
case "$filetype" in
	application/pdf)
		# Convert into a private temporary file and pass the output name to
		# pdftotext explicitly: guessing the default output name is wrong for
		# inputs not ending in ".pdf", and writing next to the PDF would
		# overwrite (and later delete) an unrelated existing .txt file.
		file="$(mktemp)" || err "cannot create a temporary file"
		tmp=1
		if ! pdftotext -layout "$1" "$file"; then
			rm -f -- "$file"
			err "failed to convert PDF to text: $1"
		fi
		;;
	text/plain) file="$1" ;;
	*) err "only PDF and text files are supported\n$1 :-\n\t$filetype" ;;
esac

# Resolve the input to an absolute path before changing directory, so it
# stays reachable from inside the destination.
file="$(realpath "$file")"

# DEST falls back to the current directory when it is unset or empty.
: "${DEST:=$PWD}"
printf "\n:: Preparing data in : %s\n" "$DEST"
mkdir -pv "$DEST"

# All CSV files below are written relative to the destination directory.
cd "$DEST" || exit

# Select the pattern that locates the line carrying the column titles:
#   with -t : the line starting with "Seat ... Name", captured up to "Total"
#   default : the first line starting with a five-digit number, capturing
#             what follows the capitalised text and a run of 2+ blanks
if [ "$tflag" = 1 ]; then
	header_re='^Seat .*Name \s{2,}(.*Total).*$'
else
	header_re='^[0-9]{5} [/A-Z ]*\s{2,}(.*)'
fi

# The first sed prints only the captured part of the first matching line
# (GNU "0,/re/" address). The second sed replaces every "[ <digits> ]"
# group (with its surrounding blanks) by a comma and drops a trailing comma.
header="$(
	sed -nE "0,/$header_re/ s//\\1/p" "$file" |
		sed -E 's/\s?\[ [0-9]+(\s\])?\s*/,/g; s/,$//'
)"

# names.csv: one "Seat No,Name" row for every input line that starts with a
# five-digit number. An existing names.csv is left untouched.
if [ -f names.csv ]; then
	echo "names.csv already exists, skipping..."
else
	echo "Seat No,Name" > names.csv
	# The name is made of the fields after the first one, up to (excluding)
	# the first field that starts with the first header title, "INTER" or "TW".
	# The title is handed over with -v instead of being spliced into the awk
	# source, so characters such as "/" in it cannot break the program.
	awk -v first_col="${header%%[ ,]*}" '
		BEGIN { stop = "^(" first_col "|INTER|TW)" }
		/^[0-9]{5}/ {
			name = ""
			# "i <= NF" ends the scan on lines that carry no stop token;
			# without it such a line would loop forever.
			for (i = 2; i <= NF && $i !~ stop; i++)
				name = name " " $i
			# name was built with one leading blank per field; drop the first
			sub(/^ /, "", name)
			print $1 "," name
		}
	' "$file" >> names.csv
fi

# From here on the header also carries the seat number column.
header="Seat No,$header"

# marks.csv: seat number followed by the values found after the "Total"
# label of the matching totals line. An existing marks.csv is left untouched.
if [ -f marks.csv ]; then
	echo "marks.csv already exists, skipping..."
else
	echo "$header" > marks.csv
	awk '
		# a line starting with five digits opens a row with the seat number
		/^[0-9]{5}/ { printf("%d", $1) }

		# totals line: contains "Total", does not start with a digit, and is
		# neither a "Seat" line nor a "Total [" line
		!/^[0-9]/ && !/Seat/ && !/Total\s*\[/ && /Total/ {
			start = 1
			# The rule only guarantees "Total" as a substring; bound the scan
			# by NF so a line without a standalone "Total" field cannot loop
			# forever. Such a line just ends the row without values.
			while (start <= NF && $start != "Total") start++
			start++
			for (i = start; i <= NF; i++) {
				if ($i ~ /^AB$/) {
					# "AB" is written as 0
					printf(",%s", "0")
				} else if ($i ~ /^[0-9]+$/) {
					printf(",%d", $i)
				}
			}
			printf("\n")
		}
	' "$file" >> marks.csv
fi

# GP.csv: seat number followed by the values listed after the "GP" label.
# Its header is the common header without a trailing ",Total".
# An existing GP.csv is left untouched.
if [ ! -f GP.csv ]; then
	{
		echo "${header%,Total}"
		awk '
			# a line starting with five digits opens a row with the seat number
			/^[0-9]{5}/ { printf("%d", $1) }

			/ GP / {
				# find the field that is exactly "GP"; values follow it
				label = 1
				while ($label != "GP") label++
				for (col = label + 1; col <= NF; col++) {
					if ($col ~ /F/)
						printf(",0")        # any field containing F becomes 0
					else if ($col ~ /^[0-9]+$/)
						printf(",%d", $col)
				}
				printf("\n")
			}
		' "$file"
	} > GP.csv
else
	echo "GP.csv already exists, skipping..."
fi

# CGP.csv: seat number, the values listed after the "CG" label, and a final
# CGPA column. An existing CGP.csv is left untouched.
if [ ! -f CGP.csv ]; then
	{
		echo "$header,CGPA"
		awk '
			# a line starting with five digits opens a row with the seat number
			/^[0-9]{5}/ { printf("%d", $1) }

			# remember the last field of the most recent " GPA " line
			/ GPA / { gpa = $NF }

			/ CG / {
				# find the field that is exactly "CG"; values follow it
				label = 1
				while ($label != "CG") label++
				for (col = label + 1; col <= NF; col++) {
					if ($col ~ /F/)
						printf(",0")        # any field containing F becomes 0
					else if ($col ~ /^[0-9]+$/)
						printf(",%d", $col)
				}
				# close the row with the remembered value, two decimals
				printf(",%.2f\n", gpa)
			}
		' "$file"
	} > CGP.csv
else
	echo "CGP.csv already exists, skipping..."
fi

# rows FILE
# Print the number of lines in FILE that do not contain "Seat No".
# A warning is sent through warn when the first line of FILE does not
# start with "Seat No,".
rows() {
	case "$(sed 1q "$1")" in
		'Seat No,'*) ;;
		*) warn "missing header in file: $1" ;;
	esac
	grep -cv 'Seat No' "$1"
}

# no. of data rows (as reported by rows) for each generated file
nr_names="$(rows names.csv)"
nr_marks="$(rows marks.csv)"
nr_GP="$(rows GP.csv)"
nr_CGP="$(rows CGP.csv)"

# Every file is expected to have as many rows as names.csv;
# any mismatch is only reported, not treated as fatal.
if [ "$nr_names" != "$nr_marks" ] ||
		[ "$nr_names" != "$nr_GP" ] ||
		[ "$nr_names" != "$nr_CGP" ]; then
	warn "inconsistent number of rows"
fi

# cols FILE
# Print the largest number of comma-separated fields found on any line of
# FILE (0 for an empty file). When the lines do not all have the same
# number of fields, a warning naming FILE is written to stderr.
cols() { awk -F, '
		NR == 1 { min = NF }
		{ if (NF > max) max = NF; if (NF < min) min = NF }
		END {
			if (max != min)
				print "WARNING: inconsistent columns in file: " FILENAME > "/dev/stderr"
			# "+ 0" forces a number, so an empty file yields 0, not a blank line
			print max + 0
		}
	' "$1"; }

# no. of columns for each file
nc_names="$(cols names.csv)"
nc_marks="$(cols marks.csv)"
nc_GP="$(cols GP.csv)"
nc_CGP="$(cols CGP.csv)"

# Expected shapes, following the headers written above:
#   names.csv : exactly 2 columns
#   marks.csv : GP.csv plus one (the GP header has ",Total" stripped)
#   CGP.csv   : marks.csv plus one (the header gets ",CGPA" appended)
if [ "$nc_names" != 2 ]; then
	warn "names data doesn't have exactly 2 columns"
fi
if [ "$nc_marks" != "$(( nc_GP + 1 ))" ]; then
	warn "marks data doesn't have an additional column to GP data"
fi
if [ "$nc_CGP" != "$(( nc_marks + 1 ))" ]; then
	warn "CGP data doesn't have an additional column to marks data"
fi

printf ":: Finished\n"

# clean up
# Written as an if statement on purpose: as the last command of the script,
# a bare `[ ... ] && rm` would make the whole script exit with status 1
# whenever no temporary file was created (plain-text input).
if [ "$tmp" = 1 ]; then
	rm -f -- "$file"
fi