My Notes


Nothing special, only my notes.

My notes now are on Hashnode, you can find them here: https://massyfigini.hashnode.dev/



# Begin with #!/ and the bash location
#!/bin/bash

# extension .sh

# two possibilities to run a bash script:
bash script.sh
./script.sh

bash script.sh One Two Three
# One, Two and Three are arguments. You can use them in the script
echo $1     # print first argument, One
echo $3     # print third argument, Three
echo $*     # print all the arguments
echo $#     # print arguments count, 3


# Variables

var1="hello"     # create a variable (no space around =)
$var1     # reference the variable
echo "good morning, $var1 and hi"

# Variable in variable
var2="Hi, equal to ${var1}"     # use ${ }, not $( ), to expand a variable inside a string


# Array

myarray=(1 2 3 4 5)

echo ${myarray[@]}     # return all elements

echo ${myarray[@]:1:3}     # return three elements starting from the second

echo ${#myarray[@]}     # return length

echo ${myarray[2]}     # return third element (zero-indexing)

myarray[0]=10     # change first element

myarray+=(8)     # add a new element at the end


# IF statement

if [ condition ]; then
    # do something...
else
    # do something...
fi

if [ "$x" == "hello" ]; then
    echo "hello!"
else
    echo "hi!"
fi

# -eq -> equal to
# -ne -> not equal to
# -gt -> greater than
# -lt -> less than
# -ge -> greater than or equal to
# -le -> less than or equal to
# -e -> if file exists
# -s -> if file exists and has size greater than 0
# -r -> if file exists and is readable
# -w -> if file exists and is writable
# || -> OR
# && -> AND
# (see the sketch below for a usage example)
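
# a minimal sketch combining a numeric test, a file test and &&
# (the variable value and file name are placeholders):
x=5
if [ $x -ge 3 ] && [ -e file.txt ]; then
    echo "x is at least 3 and file.txt exists"
fi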


# FOR loops

for x in 1 2 3
do
    # do something
done

for x in {1..5..2}     # {START..STOP..INCREMENT}
do
    echo $x
done

for ((x=2;x<=8;x+=2))     # ((start;end;increment))
do
    echo $x
done

for file in dir/*.py
do
    echo $file
done
# print all files in dir with the .py extension

for file in $(ls dir/ | grep -i 'hello')
do
    echo $file
done
# print all files in dir with the word hello in the filename


# WHILE statements

x=1
while [ $x -le 10 ]
do
    echo $x
    ((x+=1))
done


# CASE statements

case $VAR in
    PATTERN1)
    COMMAND1 ;;
    PATTERN2)
    COMMAND2 ;;
    *)
    DEFAULT COMMAND ;;
esac

case $(cat $1) in
    *hello*)
    mv $1 hello/ ;;
    *hi*)
    mv $1 hi/ ;;
    *ciao*|*hola*)
    rm $1 ;;
    *)
    mv $1 others/ ;;
esac
# move files whose content contains hello or hi into their folders, remove files containing ciao or hola,
# and move all the other files into another folder


# Basic Functions

function_name () {
    # function code
    return # something
}

# alternative syntax:
function function_name {
    # function code
    return # something
}

function_name     # call the function


# Scheduling scripts with Cron

# cron: time-based job scheduler

echo "* * * * * python create_model.py" | crontab

# echo the scheduler command into crontab

# 5 stars to set, one for each time unit:

* * * * * bash script.sh
# run every minute, forever

30 13 * * 7 bash script.sh
# run every Sunday at 13:30

15,30,45 * * * * bash script.sh
# run at minute 15, 30 and 45 of every hour

*/15 * * * * bash script.sh
# run every 15 minutes

crontab -l     # see job scheduled with cron

crontab -e     # edit your cronjob list



# Downloading Data using curl

https://curl.haxx.se/download.html

curl -O https://websitename.com/file001.txt     
# -O -> download the file keeping its original name

curl -o newname.txt https://websitename.com/file001.txt     
# -o -> download the file and save it with a new name

# NB: curl does not expand *; use the [] and {} URL globbing shown below

curl -O https://websitename.com/file[001-100].txt

curl -O https://websitename.com/file[001-100:10].txt
# download every tenth file in the range: file001.txt, file011.txt, ..., file091.txt

curl -L -C - -O https://websitename.com/file[001-100:10].txt
# -L -> follow redirects if a 3XX response code occurs
# -C - -> resume a previous file transfer from where it left off

# with curl you can also download data from 20+ protocols
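
# e.g. a download over ftp (server and file name are placeholders):
curl -O ftp://ftp.example.com/file001.txt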


# Downloading Data using Wget

https://www.gnu.org/software/wget/

# similar to curl, with these unique option flags:
# -b -> background download
# -q -> turn off output
# -c -> resume broken download

wget -bqc https://websitename.com/file001.txt
# you can combine the different option flags

wget -i list.txt
# take all the file links listed in the txt and download them
# other option flags must go before -i

wget --limit-rate=200k -i list.txt
# set download limit rate to 200k per second

wget --wait=1.5 -i list.txt
# put a 1.5 second pause between each file download

# with wget you can also download data from (s)ftp


# Data Cleaning and Munging with csvkit

pip install csvkit

pip install --upgrade csvkit

in2csv file.xlsx > file.csv     # convert first sheet to csv

in2csv file.xlsx --sheet "NameSheet2" > file.csv    
# convert NameSheet2 sheet to csv

csvlook file.csv     # see csv file content

csvstat file.csv     # print summary statistics of all columns

# csvcut filters columns

csvcut -n file.csv     # see column names and positions

csvcut -c 1 file.csv     # return first column

csvcut -c "colname" file.csv     # return colname column 

csvcut -c 2,3 file.csv     # return second and third columns

csvcut -c "colname","colname2" file.csv     

# csvgrep filters rows

# must be paired with one of these options:
# -m -> exact row value to filter
# -r -> regex pattern
# -f -> path to a file

csvgrep -c "col" -m value file.csv
# search in file.csv where col column equal to value

csvgrep -c 3 -m value file.csv
# search in file.csv where third column equal to value
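
# a sketch of the -r option (column name and regex are made up):
csvgrep -c "col" -r "^ab[0-9]+" file.csv
# keep rows where the col column matches the regex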

# csvstack for stacking multiple csv files

# the files must have the same columns, in the same order, with the same data types

csvstack file1.csv file2.csv > fileall.csv

csvstack -g "g1","g2" file1.csv file2.csv > fileall.csv
# add a group column with value g1 for records from file1 and g2 for records from file2

csvstack -g "g1","g2" -n "source" file1.csv file2.csv > fileall.csv
# same as before but choose a name for the group column


# Pulling data from database with csvkit

# sql2csv works with SQL Server, MySQL, Oracle, PostgreSQL, and others

# sql2csv executes a query and writes the result to a csv

sql2csv --db "sqlite:///dbname.db" --query "SELECT * FROM table" > file.csv
# extract data from sqlite db to csv

# csvsql uses SQL to query csv files

csvsql --query "SELECT * FROM file" file.csv

csvsql --query "SELECT * FROM file" file.csv | csvlook     # better printout

csvsql --query "SELECT * FROM file" file.csv > result.csv

csvsql --query "SELECT * FROM file JOIN file2 ON ..." file.csv file2.csv
# join two files

sqlquery="SELECT * FROM file JOIN file2 ON file.col = file2.col"
csvsql --query "$sqlquery" file.csv file2.csv
# use a shell variable to store the query (no spaces around =)

# csvsql can also be used to query a database, manipulate objects in it, and push data to it

csvsql --db "mysql:///dbs" -- insert file.csv
# insert data in a new table called file in the mysql db dbs

csvsql --db "mysql:///dbs" --query "SELECT * FROM file" -- insert file.csv
# use query

# other useful options (example below):
# --no-inference -> disable type inference and load all columns as text
# --no-constraints -> generate a schema without length limits or null checks
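
# a sketch combining the two options above (db and file names are placeholders):
csvsql --db "sqlite:///dbname.db" --no-inference --no-constraints --insert file.csv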


# Python on the command line

python     # start Python

echo "print('hello world!')"  > helloworld.py

python helloworld.py     # execute the script

pip install --upgrade pip

pip list     # see installed packages

pip install scikit-learn

pip install -r requirements.txt     # install packages listed in file

# cron: time-based job scheduler for scripts

echo "* * * * * python create_model.py" | crontab
# echo the scheduler command into crontab
# * * * * * -> run the job every minute, forever

# * meaning and order (see the example below)
# first * -> minute (0-59)
# second * -> hour (0-23)
# third * -> day of month (1-31)
# fourth * -> month (1-12)
# fifth * -> day of week (0-6, Sunday = 0)
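
# for example (script name is a placeholder):
0 7 * * 1 bash script.sh
# run every Monday at 7:00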

crontab -l     # see job scheduled with cron


pwd     # print working directory

ls     # list content

ls /dir/subdir     # starts with / -> absolute path

ls dir/subdir     # doesn't start with / -> relative path

ls -R    # also list the content of subdirectories

ls -F     # print a * after runnable programs and a / after directories

cd     # change directory

..     # parent directory

cd ..     # go to parent directory

ls ..     # list parent directory content

~     # home directory

cd ~     # go to home directory

ls ~     # list home directory content

cp     # copy files

cp original.txt duplicate.txt     # copy original.txt and name it duplicate.txt

# NB: use quotes if there are spaces in file or directory names
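
# e.g. (file names are made up):
cp "original file.txt" "duplicate file.txt"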

cp original.txt duplicate.txt backup     
# copy the 2 files into the backup directory

mv     # move or rename files or directories

mv backup/original.txt backup/duplicate.txt ..     
# move the 2 files from backup to the parent directory

mv original.txt old.txt     # rename original.txt to old.txt

mv backup backuptwo     # rename the backup directory

rm     # remove files

rm old.txt duplicate.txt     # remove the 2 files

rmdir     # remove a directory only if it's empty

rmdir backuptwo

mkdir     # make a new directory

mkdir backup

cat     # print file content

cat test.txt

less     # display file content one page at a time

less test.txt
# now we can use the spacebar to go to the next page, or q to quit

less test.txt new.txt
# :n to go to the next file, :p to go to the previous one, :q to quit

head     # display first 10 lines

head -n 3 test.txt     # only first 3 lines

# NB: with tab you have auto completion, double tab in case of ambiguity

man     # find out what a command does

man less

cut      # select columns from a file

cut -f 2-5,8 -d , file.csv     # -f = fields, -d = delimiter
# select columns from 2 to 5 and 8 using comma as separator

grep     # select lines in a file according to what they contain

grep hello file.csv     # select lines with hello

# grep patterns
# -c: print a count of matching lines rather than the lines themselves
# -h: do not print the names of files when searching multiple files
# -i: ignore case (e.g., treat "Regression" and "regression" as matches)
# -l: print the names of files that contain matches, not the matches
# -n: print line numbers for matching lines
# -v: invert the match, i.e., only show lines that don't match

grep -n -v hello test.csv     # print, with line numbers, the lines that do not contain hello

grep -c hello file.csv two.csv     # count the lines containing hello in each of the two files
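
# a sketch of -i and -l with the same files:
grep -i -l hello file.csv two.csv
# print only the names of the files that contain hello, ignoring case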

sed     # replace text

sed 's/hello/hi/g' test.txt     # replace hello with hi in test.txt

>     # redirect output to a file

head -n 3 test.txt > test2.txt

|     # create pipe

head -n 9 test.txt | tail -n 2
# take the first 9 rows, then the last two of the result (rows 8-9)
cut -d , -f 1 dir/file.csv | grep -v Date | head -n 10
# 1. select first column of the comma delimited file.csv in dir
# 2. remove lines with "Date" (maybe the header)
# 3. take the first ten lines

wc     # count characters (-c), words (-w), or lines (-l) in a file

grep hello file.csv | wc -l     # num of records with hello

# wildcards
# * -> matches 0 or more characters
# ? -> matches a single character
# [...] -> matches any one of the characters in the brackets
# {...} -> matches any of the comma-separated patterns in the brackets

cut -d , -f 1 dir/*.csv     # first field of all csv files in dir

cut -d , -f 1 dir/c*.csv     # first field of all csv files in dir starting with c
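
# a few sketches of the other wildcards (file names are made up):
ls file?.csv           # file1.csv, fileA.csv, ... (any single character)
ls file[12].csv        # file1.csv or file2.csv
ls {*.csv,*.txt}       # all csv and txt files in the current directory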

sort     # put data in order (alphabetically by default)

# sort patterns
# -r -> reverse alphabetical order
# -n -> sort numerically
# -b -> ignore leading blanks
# -f -> case insensitive

cut -d , -f 2 dir/file.csv | grep -v hello | sort -r
# take column 2 of the file, excluding the lines with hello, in reverse order
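
# a sketch of numeric sorting (file and column are placeholders):
cut -d , -f 3 dir/file.csv | sort -n | head -n 5
# take the third column and show the five smallest values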

uniq     # remove adjacent duplicate lines

cut -d , -f2 file.csv | sort -r | uniq
# take second column of the file without duplicates

cut -d , -f2 file.csv | sort -r | uniq -c
# same but with a count of how often each occurs

cut -d , -f2 file.csv | sort -r | uniq -c > new.csv
# save the result in new.csv

# ctrl+c to stop a running program

# some environment variables
# HOME -> User's home directory
# PWD -> Present working directory
# SHELL -> Which shell program is being used
# USER -> User's ID

echo     # prints its argument

echo hello     # print hello

echo $USER      # print the value of the variable

testvar=dir/file.csv    # assign dir/file.csv to the variable testvar

head -n 1 $testvar     # use $ to get the value of the variable

# expr for integer arithmetic (no decimals)

expr 1 + 3     # operands and operators must be separated by spaces

# bc is a calculator program, you can use it in a pipe

echo "5 + 7.5" | bc

echo "scale = 3; 10 / 3" | bc     # scale for how many decimals

# for loops
for [variable] in [list] ; do [body] ; done

for file in dir/*.csv; do head -n 2 $file | tail -n 1; done
# print second line of each csv in directory dir

nano file.txt     # edit file.txt with nano text editor

history     # see your command history

history | tail -n 3 > steps.sh    # save your last 3 steps to file

echo "head -n 1 dir/*.csv" > headers.sh     # save a command in a sh file

bash headers.sh     # tell the shell to run the commands in the file

$@     # all of the command-line parameters given to the script

# E.g. if unique-lines.sh contains sort $@ | uniq, executing:
bash unique-lines.sh dir/file.csv
# will run this command:
sort dir/file.csv | uniq
# if you execute this
bash unique-lines.sh dir/file.csv dir/file2.csv 
# it processes both the files

# use $1, $2, and so on to refer to specific command-line parameters
# e.g. get-field.sh contains: head -n $2 $1 | tail -n 1 | cut -d , -f $3
# it takes a filename, the row to select, and the column to select, and prints that field
bash get-field.sh dir/file.csv 4 2

# you can write for loops in shell scripts without semicolons
# Print the first and last data records of each file.
for filename in $@
do
    head -n 2 $filename | tail -n 1
    tail -n 1 $filename
done

# in shell scripts use # for comments

# use \ to continue a command on a new line
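
# for example, a pipeline split over multiple lines (file name as above):
grep -v hello dir/file.csv \
    | sort -r \
    | head -n 3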



# 1.  Importing Data from Flat Files and Spreadsheets

# read_csv for all flat files
import pandas as pd
data = pd.read_csv('file.csv')
data = pd.read_csv('file.tsv', sep='\t')

# choose columns to load by name
col_names = ['STATEFIPS', 'STATE', 'zipcode', 'agi_stub', 'N1']
data = pd.read_csv('file.csv', usecols=col_names)

# choose columns to load by number
col_nums = [0, 1, 2, 3, 4]
data = pd.read_csv('file.csv', usecols=col_nums)

# limiting rows, skip rows, no header in file
data = pd.read_csv('file.csv', nrows=1000, skiprows=1000, header=None)

# assigning column names
colnames = list(another_dataframe)
data = pd.read_csv('file.csv', header=None, names=colnames)

# specifying data types
data = pd.read_csv('file.csv', dtype={'zipcode': str})

# customizing missing data values (0 in the zipcode column is treated as NaN in the example)
data = pd.read_csv('file.csv', na_values={'zipcode' : 0})

# lines with errors
# error_bad_lines=False skips lines with errors
# warn_bad_lines=True prints a warning for bad lines
data = pd.read_csv('file.csv', error_bad_lines=False, warn_bad_lines=True)

# read_excel for excel files
data = pd.read_excel('file.xlsx')   # reads only the first sheet

# many parameters in common with read_csv
# nrows
# skiprows
# usecols: you can specify a range of letters of the spreadsheet file
# in the example I import only columns from W to AB plus AR
data = pd.read_excel('file.xlsx', usecols="W:AB, AR")

# two possibilities for taking the second sheet named sheet2
data = pd.read_excel('file.xlsx', sheet_name=1)   # zero-indexed
data = pd.read_excel('file.xlsx', sheet_name='sheet2')

# import first two sheets
data = pd.read_excel('file.xlsx', sheet_name=[0,1])   
data = pd.read_excel('file.xlsx', sheet_name=['sheet1','sheet2'])

# take all the sheets of a file (one data frame per sheet)
data = pd.read_excel('file.xlsx', sheet_name=None)

# with dtype specify column types
# true_values and false_values are useful for converting text to boolean
data = pd.read_excel('file.xlsx',
                     dtype={'col5': bool, 'col6': bool},
                     true_values=["Yes"],
                     false_values=["No"])
# NB: blank values are set to True!

# to parse datetime columns use parse_dates, not dtype
# if the column is in a standard datetime format this is enough:
data = pd.read_excel('file.xlsx',
                     parse_dates=['col1', 'col2'])

# for a non-standard date column, e.g. 311299 for 31/12/1999,
# read it as text and then convert it with to_datetime
# ('col3' here is a placeholder column name)
format_string = '%d%m%y'
data['col3'] = pd.to_datetime(data['col3'], format=format_string)


# 2.  Importing Data from Databases

import pandas as pd

# library to connect to databases
from sqlalchemy import create_engine

# create the database engine for the db data.db on sqlite
engine = create_engine('sqlite:///data.db')

# view the tables in the database
print(engine.table_names())

# load tab table without any SQL
tabloaded = pd.read_sql('tab', engine)

# create a SQL query to load data from a query
query = 'SELECT col1, col2 FROM tab WHERE col1 > 0'

# load the query result into a data frame
tabloaded = pd.read_sql(query, engine)


# 3.  Importing JSON Data and Working with APIs

import pandas as pd

# load a JSON file to a data frame
jsondata = pd.read_json('file.json')

# we can specify the orientation of the JSON 
# 'split' : dict like {index -> [index], columns -> [columns], data -> [values]}
# 'records' : list like [{column -> value}, ... , {column -> value}]
# 'index' : dict like {index -> {column -> value}}
# 'columns' : dict like {column -> {index -> value}}
# 'values' : just the values array
jsondata = pd.read_json('file.json', orient = 'split')

# import data from an API
# requests library used for any url
import requests

# example: using the Yelp API
api_url = "https://api.yelp.com/v3/businesses/search"

# set up parameter dictionary according to documentation
params = {"term": "bookstore", "location": "San Francisco"}

# set up header dictionary w/ API key according to documentation
headers = {"Authorization": "Bearer {}".format(api_key)}

# call the API
response = requests.get(api_url, params=params, headers=headers)

# isolate the JSON data from the response object
data = response.json()

# load businesses data to a data frame
df = pd.DataFrame(data["businesses"])

# library for reading nested json
from pandas.io.json import json_normalize

# flatten data and load to data frame, with _ separators
df = json_normalize(data["businesses"], sep="_")

# flatten categories data, bring in business details
df = json_normalize(data["businesses"], sep="_", record_path="categories",
     meta=["name", "alias", "rating", 
               ["coordinates", "latitude"],["coordinates", "longitude"]],
                     meta_prefix="biz_")


# 4.  Combining multiple datasets

import pandas as pd

# append datasets method
appended = df1.append(df2)

# set ignore_index to True to renumber rows
appended = df1.append(df2, ignore_index = True)

# merge(): pandas function and dataframe method for merging datasets
# default is like a SQL inner join (keep only rows with matching values in both dataframes)
merged = df1.merge(df2, on = 'col1')
merged = df1.merge(df2, left_on = 'col1', right_on = 'col2')



In the previous post (http://notes.massimilianofigini.com/2021/01/how-to-build-and-schedule-python.html) you can find how to create your Twitter developer account and how to schedule a Twitter bot.

Here is the complete code of my "Covid-19 BOT Italia", a bot started in April and still active, which automatically retweets all the Covid-19-related tweets from Italian authorities.
You can see the result here: https://twitter.com/COVIDbot_ITA


You can also find the always-up-to-date code on my GitHub (https://github.com/massyfigini/CovidBOT).

First I created a CSV file with two fields: Account contains the Twitter accounts of the Italian authorities (e.g. GiuseppeConteIT is the account name of the Italian Prime Minister), and LastTweetID contains the ID of the last tweet (numeric, the last part of the tweet's link).
Then I created an empty txt file named "Error.txt".
Both files are in the same directory as the Python script.
This is the Python script:

C_KEY = "Replace this sentence with your API Key"  
C_SECRET = "Replace this sentence with your API Key Secret"  
A_TOKEN = "Replace this sentence with your Access Token"  
A_TOKEN_SECRET = "Replace this sentence with your Access Token Secret"  

#pip install tweepy

# import libraries
import tweepy
import pandas as pd

# Authenticate to Twitter
auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN, A_TOKEN_SECRET)

# Create API object
api = tweepy.API(auth)

# search for covid
text1 = 'coronavirus'
text2 = 'covid'
text3 = 'lockdown'
text4 = 'pandemia'

text = [text1, text2, text3, text4]

# read file
df = pd.read_csv('account.csv')


# for each row of my csv
for index, row in df.iterrows():
    
    # take data
    account = row['Account']
    last_tweet_id = row['LastTweetID']
    
    # tweet limit
    max_tweets = 1000
    
    # take all user tweets
    searched_tweets = [status.id for status in 
                       tweepy.Cursor(api.user_timeline, id = account, since_id = last_tweet_id,
                                     exclude_replies = True).items(max_tweets)]
    
    try:
        # for each tweet
        for idx, i in enumerate(searched_tweets):
            tweet = api.get_status(i, tweet_mode = 'extended')  # all the text
            tweetText = tweet.full_text   # take text
            tweetText = tweetText.lower()   # lowercase
            if idx == 0: df.iloc[index, 1] = tweet.id   # store the last retweeted id in the LastTweetID column
            # filter only covid tweets and exclude retweets
            if not(tweetText.startswith('rt')) and any(x in tweetText for x in text):
                # tweet!    
                api.retweet(i)
             
    except:
        # open a file to append the error
        Err = open("Error.txt", "a", encoding="utf-8")
        Err.write(" ACCOUNT: " + account + ". TWEET IN ERROR: " + tweetText)
        Err.close()

# rewrite account.csv with last id tweet updated
df.to_csv('account.csv', index=False)

As you can see, for each account in the csv file, the script retweets every tweet that contains one of the keywords, starting from the last tweet ID. After that, it updates the csv file with the ID of the last tweet. If there is an error, the tweet that caused the problem is written to the txt file, so you can easily investigate it.


1. Create a Twitter developer account

With this account, you have access to the Twitter API we are going to need to develop the bot. 
Log in to your Twitter account and choose "apply for developer account".
Choose "Making a bot", answer the questions, accept the Developer Agreement and submit the application.

2. Write the Python script

In the developer portal (https://developer.twitter.com/en/portal/dashboard), in the "Overview" of the "Projects & apps" section, choose "Create app" and give it a name. Now you can find your keys and tokens to program the app.
With your source-code editor, write the code for your bot. You can use the Tweepy package to access almost all Twitter functionality in Python.
To interact with your account, you must use this code:

C_KEY = "Replace this sentence with your API Key"  
C_SECRET = "Replace this sentence with your API Key Secret"  
A_TOKEN = "Replace this sentence with your Access Token"  
A_TOKEN_SECRET = "Replace this sentence with your Access Token Secret"  

# import libraries (if you haven't installed it use "pip install tweepy")
import tweepy

# Authenticate to Twitter
auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN, A_TOKEN_SECRET)

In my next post I'll give you an example of a complete bot code.

3. Schedule the bot with Task Scheduler

The simplest way to schedule a bot on a Windows PC is through Task Scheduler.
First we have to create a bat file to run the Python script.
Open a notepad file and paste this code:

"C:\[Dir1]\python.exe" "C:\[Dir2]\COVID_TwitterBOT\bot.py"

In the code above, replace [Dir1] with the path of your Python installation, [Dir2] with the path of your Python script, and "bot.py" with the name of your Python script.
Save the text file in the same directory as the Python script and name it "Run_BOT.bat".
If you double-click the file, the Python script will be executed.
NB: if you get an error, you may need to add Python to the Windows Path; you can find a guide here: https://geek-university.com/python/add-python-to-the-windows-path/.

Now enter Task Scheduler and on the left, under "Task Scheduler Library" right click and choose "Create Task...".
In the "General" tab, give it a name and choose "Run whether user is logged or not".
In the "Triggers" tab, click on "New..." and choose the trigger type. For example if you need to run the script every 5 minutes, choose "Repeat task every" and select "5 minutes" and "Indefinitely" in the duration box. Make sure to tick "Enabled" in the last checkbox, and choose OK.
In the "Actions" tab, click on "New...", choose "Start a Program" in the first box, in the Program/Script box click on "Browse" and select your script, in the "Add arguments" box write the name of your bat file (Run_BOT.bat), and in the "Start in" box write the entire path in which you have the bat file without the file name (es. C:\BotProject). Click ok and then rigth click on the new task you have just created and choose "Run" for the first run.
Now your script will be execute every 5 minutes when your PC is turned on.




PROBLEM: if you try to execute a stored procedure containing a temporary table from an SSIS Data Flow, you receive this error message:

Exception from HRESULT: 0xC020204A
Error at From SQL Server to Excel [OLE DB Source [91]]: SSIS Error Code DTS_E_OLEDBERROR.  An OLE DB error has occurred. Error code: 0x80004005.
An OLE DB record is available.  Source: "Microsoft SQL Server Native Client 11.0"  Hresult: 0x80004005  Description: "The metadata could not be determined because statement 'INSERT INTO #TempTable [...]".
Error at From SQL Server to Excel [OLE DB Source [91]]: Unable to retrieve column information from the data source. Make sure your target table in the database is available.


SOLUTION: It's incredible that on Stack Overflow and various forums the answer to this problem is "SSIS doesn't get along with temporary tables, you have to use table variables or CTEs".
You can use temp tables: you only have to tell SSIS the result set of your stored procedure with the WITH RESULT SETS clause:

    EXECUTE YourSP
    WITH RESULT SETS
    ((    
        One bigint NOT NULL, 
        Two varchar(35) NULL, 
        Three char(6) NULL
    ))

That's all.