#!/bin/bash
# save the script with the .sh extension
bash script.sh
./script.sh
bash script.sh One Two Three
# One, Two and Three are arguments. You can use them in the script
echo $1 # print first argument, One
echo $3 # print third argument, Three
echo $* # print all the arguments
echo $# # print arguments count, 3
var1="hello" # create a variable (no space around =)
echo $var1 # reference the variable with $
echo "good morning, $var1 and hi"
# Variable in variable
var2="Hi equal to $var1" # use $var1 or ${var1}, not $(var1), which runs a command
myarray=(1 2 3 4 5)
echo ${myarray[@]} # return all elements
echo ${myarray[@]:1:3} # return three elements starting from second
echo ${#myarray[@]} # return length
echo ${myarray[2]} # return third element (zero-indexing)
myarray[0]=10 # change first element
myarray+=(8) # add a new element at the end
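# a quick sketch: loop over every element of the array above
for i in ${myarray[@]}
do
echo $i
done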
# IF statement
if [ condition ]; then
# do something...
else
# do something...
fi
# a concrete example (the condition is illustrative):
if [ $1 -gt 5 ]; then
echo "hello!"
else
echo "hi!"
fi
# -eq -> equal to
# -ne -> not equal to
# -gt -> greater than
# -lt -> less than
# -ge -> greater than or equal to
# -le -> less than or equal to
# -e -> if file exists
# -s -> if file exists and has size greater than 0
# -r -> if file exists and is readable
# -w -> if file exists and is writable
# || -> OR
# && -> AND
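# example combining the tests above (file.txt is a hypothetical name):
if [ -s file.txt ] && [ $# -gt 0 ]; then
echo "file has content and the script received arguments"
fi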
# FOR loops
for x in 1 2 3
do
# do something
done
for x in {1..5..2} # {START..STOP..INCREMENT}
do
echo $x
done
for ((x=2;x<=8;x+=2)) # ((start;end;increment))
do
echo $x
done
for file in dir/*.py
do
echo $file
done
# list all files in dir with the .py extension
for file in dir/*hello*
do
echo $file
done
# list all files in dir with the word hello in the filename
# WHILE statements
x=1 # initialize the counter first
while [ $x -le 10 ] # NB: the spaces inside [ ] are required
do
echo $x
((x+=1))
done
# CASE statements
case $(cat $1) in
*hello*)
mv $1 hello/ ;;
*hi*)
mv $1 hi/ ;;
*ciao*|*hola*)
rm $1 ;;
*)
mv $1 others/ ;;
esac
# move files whose content contains hello or hi into their folders, remove the ciao and hola files,
# and move all the other files to another folder
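# a simpler case on the first argument (values are illustrative):
case $1 in
start)
echo "starting" ;;
stop)
echo "stopping" ;;
*)
echo "unknown command" ;;
esac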
# Basic Functions
function_name () {
# function code
return # NB: return only sets an exit status (0-255); use echo to output values
}
function function_name {
# function code
return # something
}
function_name # call the function
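# a minimal working example (name and numbers are illustrative):
sum () {
echo $(($1 + $2))
}
sum 3 4 # prints 7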
# Scheduling scripts with Cron
# cron: time-based job scheduler
echo "* * * * * python create_model.py" | crontab
# echo the scheduler command into crontab (NB: piping into crontab replaces the current one)
# 5 stars to set, one for each time unit:
* * * * * bash script.sh
# run every minute, forever
30 13 * * 0 bash script.sh
# run every Sunday at 13:30
15,30,45 * * * * bash script.sh
# run at minutes 15, 30 and 45 of every hour
*/15 * * * * bash script.sh
# run every 15 minutes
crontab -l # see job scheduled with cron
crontab -e # edit your cronjob list
# Downloading Data using curl
# download curl from: https://curl.haxx.se/download.html
curl -O https://websitename.com/file001.txt
# -O -> download file keeping its name
curl -o newname.txt https://websitename.com/file001.txt
# -o -> download file with new name
# NB: curl does not expand *; use [] ranges or {} lists for multiple files
curl -O https://websitename.com/file[001-100].txt
# download file001.txt through file100.txt
curl -O https://websitename.com/file[001-100:10].txt
# download only file010.txt, file020.txt, ..., file100.txt
# -L -> follow redirects (3xx response codes)
# -C - -> resume a previous file transfer that was interrupted
# with curl you can also download data from 20+ protocols
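# example combining the flags above (URL is a placeholder):
curl -L -O -C - https://websitename.com/file001.txt
# follow redirects, keep the remote name, and resume if interrupted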
# Downloading Data using Wget
# download wget from: https://www.gnu.org/software/wget/
# -b -> background download
# -q -> turn off output
# -c -> resume broken download
wget -bqc https://websitename.com/file001.txt
# you can link the different option flags
# take all the file links in the txt file and download them
wget --limit-rate=200k -i list.txt
# set download limit rate to 200k per second
wget --wait=1.5 -i list.txt
# put a 1.5 second pause between each file download
# with wget you can also download data from (S)FTP
# Data Cleaning and Munging with csvkit
pip install csvkit
pip install --upgrade csvkit
in2csv file.xlsx > file.csv # convert first sheet to csv
in2csv file.xlsx --sheet "NameSheet2" > file.csv # convert the NameSheet2 sheet to csv
csvlook file.csv # see csv file content
csvstat file.csv # print summary statistics of all columns
# csvcut to filter columns
csvcut -n file.csv # see column names and positions
csvcut -c 1 file.csv # return first column
csvcut -c "colname" file.csv # return colname column
csvcut -c 2,3 file.csv # return second and third columns
csvcut -c "colname","colname2" file.csv
# csvgrep to filter rows
# -m -> exact row value to filter
# -r -> regex pattern
# -f -> path to a file containing the values to match
csvgrep -c "col" -m value file.csv
# search in file.csv where the col column equals value
csvgrep -c 3 -m value file.csv
# search in file.csv where the third column equals value
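# regex example (pattern is illustrative): rows where col starts with R
csvgrep -c "col" -r "^R" file.csv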
# csvstack for stacking multiple csv files
# the files must have the same columns, in the same order, with the same data types
csvstack file1.csv file2.csv > fileall.csv
csvstack -g "g1","g2" file1.csv file2.csv > fileall.csv
# add a group column: g1 if the record comes from file1, g2 if from file2
csvstack -g "g1","g2" -n "source" file1.csv file2.csv > fileall.csv
# same as before, but choose a name for the group column
# Pulling data from database with csvkit
# sql2csv works with SQL Server, MySQL, Oracle, PostgreSQL, and others
# sql2csv executes a query and puts the results in a csv
sql2csv --db "sqlite:///dbname.db" --query "SELECT * FROM table" > result.csv
# extract data from a sqlite db to csv (db name and table are placeholders)
# csvsql uses SQL to query csv files
csvsql --query "SELECT * FROM file" file.csv
csvsql --query "SELECT * FROM file" file.csv | csvlook # better printout
csvsql --query "SELECT * FROM file" file.csv > result.csv
# join two files
sqlquery = "SELECT * FROM file JOIN file2 ON file.col = file2.col"
csvsql --query "$sqlquery" file.csv file2.csv
# use a shell variable to store the query
# csvsql can also query a db, manipulate objects on it, and push data to it
csvsql --db "mysql:///dbs" -- insert file.csv
# insert data in a new table called file in the mysql db dbs
# use query
# other useful attributes:
# --no-inference -> create all table columns in text format
# --no-constraints -> no length limits or null checks
# Python on the command line
echo "print('hello world!')" > helloworld.py
python helloworld.py # execute the script
pip install --upgrade pip
pip list # see installed packages
pip install scikit-learn
pip install -r requirements.txt # install packages listed in file
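# you can also run a one-liner without a script file:
python -c "print('hello world!')"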
# cron: time-based job scheduler for scripts
# * meaning and order
# first * -> minute (0-59)
# second * -> hour (0-23)
# third * -> day of month (1-31)
# fourth * -> month (1-12)
# fifth * -> day of week (0-6, Sunday = 0)
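# e.g. run at 9:00 on the first day of every month (script name is a placeholder):
0 9 1 * * bash script.sh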
pwd # print working directory
ls # list content
ls /dir/subdir # starts with / -> absolute path
ls dir/subdir # doesn't start with / -> relative path
ls -R # list also the contents of subdirectories, recursively
ls -F # append a * to runnable programs and a / to directories
cd # change directory
.. # parent directory
cd .. # go to parent directory
ls .. # list parent directory content
~ # home directory
cd ~ # go to home directory
ls ~ # list home directory content
cp # copy files
cp original.txt duplicate.txt # copy original.txt and name it duplicate.txt
# NB: use quotes if there are spaces in file or directory names
cp original.txt duplicate.txt backup/ # copy the 2 files into the backup directory
mv # move or rename files or directories
mv backup/original.txt backup/duplicate.txt .. # move the 2 files from backup to the parent directory
mv original.txt old.txt # rename original.txt to old.txt
mv backup backuptwo # rename the backup directory
rm # remove files
rm old.txt duplicate.txt # remove the 2 files
rmdir # remove a directory only if it's empty
rmdir backuptwo
mkdir # make a new directory
mkdir backup
cat # print file content
cat test.txt
less # display file content one page at time
less test.txt
# now use the spacebar to go to the next page, q to quit
head # display first 10 lines
head -n 3 test.txt # only first 3 lines
# NB: with tab you have auto completion, double tab in case of ambiguity
man # find out what a command does
man less
cut # select columns from a file
cut -d , -f 2-5,8 file.csv # select columns 2 to 5 and 8, using comma as separator
grep # select lines in a file according to what they contain
grep hello file.csv # select lines with hello
# -c: print a count of matching lines rather than the lines themselves
# -h: do not print the names of files when searching multiple files
# -i: ignore case (e.g., treat "Regression" and "regression" as matches)
# -l: print the names of files that contain matches, not the matches
# -n: print line numbers for matching lines
# -v: invert the match, i.e., only show lines that don't match
grep -n -v hello test.csv # print the lines that don't contain hello, with line numbers
grep -c hello file.csv two.csv # how many lines with hello in the two files
sed # replace
sed 's/hello/hi/g' test.txt # replace hello with hi in test.txt
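# sed prints to stdout; GNU sed can also edit the file in place:
sed -i 's/hello/hi/g' test.txt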
> # redirect output to a file
head -n 3 test.txt > test2.txt
| # create pipe
head -n 9 file.txt | tail -n 2 # take the first 9 rows, then the last 2 of those (rows 8-9)
# 1. select first column of the comma delimited file.csv in dir
# 2. remove lines with "Date" (maybe the header)
# 3. take the first ten lines
cut -d , -f 1 dir/file.csv | grep -v Date | head -n 10
wc # count of the characters (-c), words (-w), lines (-l) in a file
grep hello file.csv | wc -l # num of records with hello
# * -> matches 0 or more characters
# ? -> matches a single character
# [...] -> matches any one of the characters in the brackets
# {...} -> matches any of the comma-separated patterns in the brackets
cut -d , -f 1 dir/*.csv # first field of all csv files in dir
cut -d , -f 1 dir/c*.csv # first field of all csv files in dir starting with c
sort # put data in order (alphabetically by default)
# -n -> sort numerically
# -r -> reverse order
# -b -> ignore leading blanks
# -f -> case insensitive
cut -d , -f 2 dir/file.csv | grep -v hello | sort -r
# take column 2 of the file, keep only the lines without hello, in reverse order
uniq # remove adjacent duplicate lines (so sort first)
cut -d , -f 2 file.csv | sort -r | uniq
# take the second column of the file without duplicates
cut -d , -f 2 file.csv | sort -r | uniq -c
# same but with a count of how often each occurs
cut -d , -f 2 file.csv | sort -r | uniq -c > new.csv
# same but save the result in new.csv
# ctrl+c to stop a running program
# HOME -> User's home directory
# PWD -> Present working directory
# SHELL -> Which shell program is being used
# USER -> User's ID
echo # prints its argument
echo hello # print hello
echo $USER # print the value of the variable
testvar=dir/file.csv # assign dir/file.csv to the variable testvar
head -n 1 $testvar # use $ to get the variable's value
# expr for numeric calculations, but without decimals
expr 1 + 3 # NB: expr needs spaces around the operator
# bc is a calculator program, you can use it in a pipe
echo "5 + 7.5" | bc
echo "scale = 3; 10 / 3" | bc # scale for how many decimals
# for loops
for [variable] in [list] ; do [body] ; done
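# a concrete one-liner following the pattern above:
for f in *.csv ; do echo $f ; done # print every csv filename in this directory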
nano file.txt # edit file.txt with nano text editor
history # see your command history
history | tail -n 3 > steps.sh # save your last 3 steps to file
echo "head -n 1 dir/*.csv" > headers.sh # save a command in a .sh file
bash headers.sh # tell the shell to run the commands in the file
$@ # all of the command-line parameters given to the script
# e.g. unique-lines.sh contains: sort $@ | uniq
bash unique-lines.sh dir/file.csv
# will run this command:
sort dir/file.csv | uniq
# use $1, $2, and so on to refer to specific command-line parameters
# get-field.sh contains: head -n $2 $1 | tail -n 1 | cut -d , -f $3
# take a filename, the row to select, the column to select, and print that field
bash get-field.sh dir/file.csv 4 2
# Print the first and last data records of each file.
for filename in $@
do
head -n 2 $filename | tail -n 1
tail -n 1 $filename
done
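# run it on several files at once (script name and paths are placeholders):
bash print-ends.sh dir/*.csv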
# in shell scripts use # for comments
# use \ to continue a command on the next line
# 1. Importing Data from Flat Files and Spreadsheets
import pandas as pd
pd.read_csv("file.csv", usecols=[0, 2]) # choose columns to load by number
# 2. Importing Data from Databases
# 3. Importing JSON Data and Working with APIs
# 4. Combining multiple datasets
In the previous post (http://notes.massimilianofigini.com/2021/01/how-to-build-and-schedule-python.html) you can find out how to create your Twitter developer account and how to schedule a Twitter bot.
You can see the result here: https://twitter.com/COVIDbot_ITA
1. Create a Twitter developer account
2. Write the Python script
3. Schedule the bot with Task Scheduler
Exception from HRESULT: 0xC020204A
Error at From SQL Server to Excel [OLE DB Source [91]]: SSIS Error Code DTS_E_OLEDBERROR. An OLE DB error has occurred. Error code: 0x80004005.
An OLE DB record is available. Source: "Microsoft SQL Server Native Client 11.0" Hresult: 0x80004005 Description: "The metadata could not be determined because statement 'INSERT INTO #TempTable [...]".
Error at From SQL Server to Excel [OLE DB Source [91]]: Unable to retrieve column information from the data source. Make sure your target table in the database is available.