# Get line nr. XY from a file in a fast way via sed
sed -n '99999p' myfile.fasta
# Get number of sequences in a fasta file
grep -c '>' myfile.fasta
2014-04-10
Convert UCSC Aln
UCSC 100-Way Genome Alignment, downloaded from their website,
is in strange format, hence:
1. Delete all lines not belonging to alignment
2. Execute RegEx, to include a tab between species name and sequence, but not in between different words in a species name
perl -pe "s/(([a-zA-Z-\(\)'\.]+\s){1,})([Nacgt=-]+)/\1\t\3/g"
3. Upload file to Galaxy
4. Tabular-To-Fasta Converter
5. Concatenate Fasta by Species
Done!
is in strange format, hence:
1. Delete all lines not belonging to alignment
2. Execute RegEx, to include a tab between species name and sequence, but not in between different words in a species name
perl -pe "s/(([a-zA-Z-\(\)'\.]+\s){1,})([Nacgt=-]+)/\1\t\3/g"
3. Upload file to Galaxy
4. Tabular-To-Fasta Converter
5. Concatenate Fasta by Species
Done!
2014-04-06
Script to compute bp of a band in a gel electrophoresis picture
mm - milimeter from where sample was put into gele
bp - base pair
You need:
# R Script file get_bp_from_mm.R:
getbpfrommm <- function(marker, unknown, unknown.filename){
# log bp of marker
markerlog <- marker
markerlog$bp <- log(markerlog$bp)
# make linear model
marker.lm <- lm(markerlog$bp ~ markerlog$mm)
b=coef(marker.lm)[1]
m=coef(marker.lm)[2]
unknown.bp <- c()
for(unknown.mm.i in unknown){
unknown.bp.i = round(exp(m*unknown.mm.i+b))
unknown.bp <- c(unknown.bp, unknown.bp.i)}
unknown.bp.mm <- cbind(unknown, unknown.bp)
colnames(unknown.bp.mm) <- c("mm", "bp")
print(unknown.bp.mm)
write.table(unknown.bp.mm, file=paste(unknown.filename, "with_bp.csv", sep="_"), quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
}
#read command line arguments
args <- commandArgs(trailingOnly = TRUE)
marker <- read.delim(args[1])
unknown <- read.delim(args[2])
getbpfrommm(marker, unknown, args[2])
# Invoke from command line
$ cat marker.csv
mm bp
132.60 75
116.80 200
106.20 300
95.60 400
89.20 500
72.20 700
62.80 1000
48.30 1500
38.40 2000
19.1 5000
$ cat sample.csv
mm
103.4
98.8
$ Rscript get_bp_from_mm.R marker.csv sample.csv
mm bp
1 103.4 272
2 98.8 317
$ cat sample.csv_with_bp.csv
mm bp
103.4 272
98.8 317
bp - base pair
You need:
- Text file with marker sizes and distances (marker.csv)
- Text file with distance of your sample (sample.csv)
# R Script file get_bp_from_mm.R:
getbpfrommm <- function(marker, unknown, unknown.filename){
# log bp of marker
markerlog <- marker
markerlog$bp <- log(markerlog$bp)
# make linear model
marker.lm <- lm(markerlog$bp ~ markerlog$mm)
b=coef(marker.lm)[1]
m=coef(marker.lm)[2]
unknown.bp <- c()
for(unknown.mm.i in unknown){
unknown.bp.i = round(exp(m*unknown.mm.i+b))
unknown.bp <- c(unknown.bp, unknown.bp.i)}
unknown.bp.mm <- cbind(unknown, unknown.bp)
colnames(unknown.bp.mm) <- c("mm", "bp")
print(unknown.bp.mm)
write.table(unknown.bp.mm, file=paste(unknown.filename, "with_bp.csv", sep="_"), quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
}
#read command line arguments
args <- commandArgs(trailingOnly = TRUE)
marker <- read.delim(args[1])
unknown <- read.delim(args[2])
getbpfrommm(marker, unknown, args[2])
# Invoke from command line
$ cat marker.csv
mm bp
132.60 75
116.80 200
106.20 300
95.60 400
89.20 500
72.20 700
62.80 1000
48.30 1500
38.40 2000
19.1 5000
$ cat sample.csv
mm
103.4
98.8
$ Rscript get_bp_from_mm.R marker.csv sample.csv
mm bp
1 103.4 272
2 98.8 317
$ cat sample.csv_with_bp.csv
mm bp
103.4 272
98.8 317
2014-04-02
sed to edit annotation line of NCBI RefSeq fasta file
cat test.fa
>gi|254939587|ref|NG_012567.1| Homo sapiens NADPH oxidase 1 (NOX1), RefSeqGene on chromosome X
ATTCTGTGATCACCAGCTTATCAAAAGACTTCCTAGTACTCTGATATTGGGAATGGGGGTCCTACCTCACAGACATAAGG
GTCCAATCAGCATGGCATATATAATTCTTTAGATAATACATAAATTGTCATCCAGATTATAGATCATTCTTTTATGAATC
ACAGGATCTCAATGTTGGAGTATATTTAAGGGACATTTAGTTAACCATCTACCTGGTGCTGATATTCCCCTTATAAAAAG
CTGACAAGGGGTTGTCCATTTTTCCTTGAGAGTCTCAAGTAATAGGAAACTCATTACCTCTTTACTTCTCTGAATACCCG
TGTTGGAAAATTCTGCTTTATATTGAAAAAAAATTGTGTTACTTTATTTATTTTTATTTTTATTTTTTGACACAGAATCT
TATTCTTTCGCCCAGTCTGGAGTGCAGTGGCGTGACCTTGGCTCACCACAACCCTCGCCTCCTGGATTCAAGCGATTCAA
GTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCATGCACCACCATGCCAGGCTAATTTTTGTATTTTTGGTAGAGATG
GGGGTTTGCCATGTTGGCCAGCCAGGCTTGTCTCAAACTCCTGACCCCAGGTGATCTGCCCGCCTCAGCCTCCCAAAGTG
CTGGGATTATAGGCGTGAGCCACCATGCCCAACCTAAAGTGTGTTACTTTAGAGCCTCTATTCCTTGGTTGACTCCTTGG
sed -e'/>/s/^[^|]*\|\(.*\)\|\(.*\)\|.*$/>\2/g' test.fa | sed 's/\.[0-9]*//g' > test2.fa
cat test2.fa
>NG_012567
ATTCTGTGATCACCAGCTTATCAAAAGACTTCCTAGTACTCTGATATTGGGAATGGGGGTCCTACCTCACAGACATAAGG
GTCCAATCAGCATGGCATATATAATTCTTTAGATAATACATAAATTGTCATCCAGATTATAGATCATTCTTTTATGAATC
ACAGGATCTCAATGTTGGAGTATATTTAAGGGACATTTAGTTAACCATCTACCTGGTGCTGATATTCCCCTTATAAAAAG
CTGACAAGGGGTTGTCCATTTTTCCTTGAGAGTCTCAAGTAATAGGAAACTCATTACCTCTTTACTTCTCTGAATACCCG
TGTTGGAAAATTCTGCTTTATATTGAAAAAAAATTGTGTTACTTTATTTATTTTTATTTTTATTTTTTGACACAGAATCT
TATTCTTTCGCCCAGTCTGGAGTGCAGTGGCGTGACCTTGGCTCACCACAACCCTCGCCTCCTGGATTCAAGCGATTCAA
GTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCATGCACCACCATGCCAGGCTAATTTTTGTATTTTTGGTAGAGATG
GGGGTTTGCCATGTTGGCCAGCCAGGCTTGTCTCAAACTCCTGACCCCAGGTGATCTGCCCGCCTCAGCCTCCCAAAGTG
CTGGGATTATAGGCGTGAGCCACCATGCCCAACCTAAAGTGTGTTACTTTAGAGCCTCTATTCCTTGGTTGACTCCTTGG
>gi|254939587|ref|NG_012567.1| Homo sapiens NADPH oxidase 1 (NOX1), RefSeqGene on chromosome X
ATTCTGTGATCACCAGCTTATCAAAAGACTTCCTAGTACTCTGATATTGGGAATGGGGGTCCTACCTCACAGACATAAGG
GTCCAATCAGCATGGCATATATAATTCTTTAGATAATACATAAATTGTCATCCAGATTATAGATCATTCTTTTATGAATC
ACAGGATCTCAATGTTGGAGTATATTTAAGGGACATTTAGTTAACCATCTACCTGGTGCTGATATTCCCCTTATAAAAAG
CTGACAAGGGGTTGTCCATTTTTCCTTGAGAGTCTCAAGTAATAGGAAACTCATTACCTCTTTACTTCTCTGAATACCCG
TGTTGGAAAATTCTGCTTTATATTGAAAAAAAATTGTGTTACTTTATTTATTTTTATTTTTATTTTTTGACACAGAATCT
TATTCTTTCGCCCAGTCTGGAGTGCAGTGGCGTGACCTTGGCTCACCACAACCCTCGCCTCCTGGATTCAAGCGATTCAA
GTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCATGCACCACCATGCCAGGCTAATTTTTGTATTTTTGGTAGAGATG
GGGGTTTGCCATGTTGGCCAGCCAGGCTTGTCTCAAACTCCTGACCCCAGGTGATCTGCCCGCCTCAGCCTCCCAAAGTG
CTGGGATTATAGGCGTGAGCCACCATGCCCAACCTAAAGTGTGTTACTTTAGAGCCTCTATTCCTTGGTTGACTCCTTGG
sed -e'/>/s/^[^|]*\|\(.*\)\|\(.*\)\|.*$/>\2/g' test.fa | sed 's/\.[0-9]*//g' > test2.fa
cat test2.fa
>NG_012567
ATTCTGTGATCACCAGCTTATCAAAAGACTTCCTAGTACTCTGATATTGGGAATGGGGGTCCTACCTCACAGACATAAGG
GTCCAATCAGCATGGCATATATAATTCTTTAGATAATACATAAATTGTCATCCAGATTATAGATCATTCTTTTATGAATC
ACAGGATCTCAATGTTGGAGTATATTTAAGGGACATTTAGTTAACCATCTACCTGGTGCTGATATTCCCCTTATAAAAAG
CTGACAAGGGGTTGTCCATTTTTCCTTGAGAGTCTCAAGTAATAGGAAACTCATTACCTCTTTACTTCTCTGAATACCCG
TGTTGGAAAATTCTGCTTTATATTGAAAAAAAATTGTGTTACTTTATTTATTTTTATTTTTATTTTTTGACACAGAATCT
TATTCTTTCGCCCAGTCTGGAGTGCAGTGGCGTGACCTTGGCTCACCACAACCCTCGCCTCCTGGATTCAAGCGATTCAA
GTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCATGCACCACCATGCCAGGCTAATTTTTGTATTTTTGGTAGAGATG
GGGGTTTGCCATGTTGGCCAGCCAGGCTTGTCTCAAACTCCTGACCCCAGGTGATCTGCCCGCCTCAGCCTCCCAAAGTG
CTGGGATTATAGGCGTGAGCCACCATGCCCAACCTAAAGTGTGTTACTTTAGAGCCTCTATTCCTTGGTTGACTCCTTGG
Subscribe to:
Posts (Atom)