Compare revisions

Toby Hodges · Toby Hodges · Toby Hodges · Toby Hodges · Toby Hodges · Toby Hodges
--- a/_config.yml
+++ b/_config.yml
@@ -7,7 +7,7 @@
 # dc: Data Carpentry
 # lc: Library Carpentry
 # cp: Carpentries (to use for instructor traning for instance)
-carpentry: "swc"
+carpentry: "bio-it"

 # Overall title for pages.
 title: "Intermediate Python"
@@ -29,10 +29,11 @@ kind: "lesson"
 repository: <USERNAME>/<PROJECT>

 # Email address, no mailto:
-email: "team@carpentries.org"
+email: "bio-it@embl.de"

 # Sites.
 amy_site: "https://amy.software-carpentry.org/workshops"
+bio-it-home: "https://bio-it.embl.de"
 carpentries_github: "https://github.com/carpentries"
 carpentries_pages: "https://carpentries.github.io"
 carpentries_site: "https://carpentries.org/"

--- a/_episodes/00-howto.md
+++ b/_episodes/00-howto.md
+---
+title: How to use the lesson template
+teaching: 10
+exercises: 10
+questions:
+- "Questions here"
+objectives:
+- "Together with objectives are shown on the top of the page"
+keypoints:
+- "keypoints are 'take home' messages"
+- "they are shown at the end of the lesson"
+---
+
+## Notes on how to use this lesson template
+
+See the [Lesson Example][lesson-example]
+and [The Carpentries Curriculum Development Handbook][cdh]
+for full details.
+Below should be all the things you need to know right now...
+
+### Creating pages
+
+- Write material with [Markdown][markdown-cheatsheet]
+  - markdown files will be rendered as HTML pages and included in the built site
+- `.md` files in the `_episodes` folder will be added to the Episodes dropdown, page navigation, etc
+- Markdown files must include _front matter_: metadata specified in a YAML header bounded by `---`
+- At minimum, this must include a `title` field
+
+~~~
+---
+title: The Title of the Section
+---
+~~~
+{: .source}
+
+- but really your episodes (lesson sections) should include:
+  - an estimate of time required for teaching & exercises
+  - main questions answered in the section
+  - learning objectives
+  - key points to summarise what's covered in the section (these are added at the end of the lesson section)
+- as an example, below is the front matter for this page
+
+~~~
+---
+title: Syntax Elements & Powerful Functions
+teaching: 20
+exercises: 10
+questions:
+- "What elements of Python syntax might I see in other people's code?"
+- "How can I use these additional features of Python to take my code to the next level?"
+- "What built-in functions and standard library modules are recommended to improve my code?"
+objectives:
+- "write comprehensions to improve code readability and efficiency."
+- "call functions designed to make common tasks easier and faster."
+- "recognise all elements of modern Python syntax and explain their purpose."
+keypoints:
+- "Use comprehensions to efficiently create new iterables with fewer lines of code."
+- "Sets can be extremely useful when comparing collections of objects, and can significantly speed up your code."
+- "The `itertools` module includes many helpful functions for working with iterables."
+- "A decorator is a function that does something to the output of another function."
+---
+~~~
+{: .source}
+
+## Code blocks
+
+code snippets written like this
+
+{% raw %}
+    ~~~
+    print(weight_kg)
+    ~~~
+    {: .language-python}
+    ~~~
+    60.0
+    ~~~
+    {: .output}
+{% endraw %}
+
+will produce formatted blocks like this:
+
+~~~
+print(weight_kg)
+~~~
+{: .language-python}
+~~~
+60.0
+~~~
+{: .output}
+
+## Special blockquotes
+
+- The lesson template also includes a range of styled boxes
+  - examples for exercises and callouts below
+  - see [this section][lesson-example-blockquotes] of The Carpentries Lesson Example for the full list
+
+A callout block written like this
+
+~~~
+> ## Callout block example
+>
+> Write callout blocks as blockquotes,
+> with a styling tag (technical term is a _class identifier_) at the end.
+>
+> ~~~
+> # you can still include code blocks in the callout
+> weight_lb = 2.2 * weight_kg
+> print(weight_kg_text, weight_kg, 'and in pounds:', weight_lb)
+> ~~~
+> {: .language-python}
+>
+> Use callouts for asides and comments -
+> anything that provides additional detail to the core of your material
+{: .callout}
+~~~
+{: .source}
+
+will be rendered like this:
+
+> ## Callout block example
+>
+> Write callout blocks as blockquotes,
+> with a styling tag (technical term is a _class identifier_) at the end.
+>
+> ~~~
+> # you can still include code blocks in the callout
+> weight_lb = 2.2 * weight_kg
+> print(weight_kg_text, weight_kg, 'and in pounds:', weight_lb)
+> ~~~
+> {: .language-python}
+>
+> Use callouts for asides and comments -
+> anything that provides additional detail to the core of your material
+{: .callout}
+
+Similarly, exercises written like this
+
+~~~
+> ## Sorting Out References
+>
+> What does the following program print out?
+>
+> ~~~
+> first, second = 'Grace', 'Hopper'
+> third, fourth = second, first
+> print(third, fourth)
+> ~~~
+> {: .language-python}
+>
+> > ## Solution
+> >
+> > This text will only be visible if the solution is expanded
+> > ~~~
+> > Hopper Grace
+> > ~~~
+> > {: .output}
+> {: .solution}
+{: .challenge}
+~~~
+{: .source}
+
+will be rendered like this (note the expandable box containing the solution):
+
+> ## Sorting Out References
+>
+> What does the following program print out?
+>
+> ~~~
+> first, second = 'Grace', 'Hopper'
+> third, fourth = second, first
+> print(third, fourth)
+> ~~~
+> {: .language-python}
+>
+> > ## Solution
+> >
+> > This text will only be visible if the solution is expanded
+> > ~~~
+> > Hopper Grace
+> > ~~~
+> > {: .output}
+> {: .solution}
+{: .challenge}
+
+## Shared link references
+
+- Lastly, the last line in every `.md` file for each page should be
+
+{% raw %}
+`{% include links.md %}`
+{% endraw %}
+
+- This allows us to share link references across the entire site, which makes the links much more maintainable.
+  - link URLs should be put in the `_includes/links.md` file (ideally, arranged alphabetically by reference)
+  - you can then write Markdown links "reference-style" i.e. `[link text to be displayed][reference-id]`, with `[reference-id]: https://link.to.page` in `_includes/links.md`
+
+{% include links.md %}
--- a/_episodes/01-syntax.md
+++ b/_episodes/01-syntax.md
@@ -44,187 +44,38 @@ keypoints:
 - (?) honorable mentions - useful modules
  - `plotnine`

-## Notes on how to use this lesson template

-See the [Lesson Example][lesson-example]
-and [The Carpentries Curriculum Development Handbook][cdh]
-for full details.
-Below should be all the things you need to know right now...
+## Exercises

-### Creating pages
-
- Write material with [Markdown][markdown-cheatsheet]
-  - markdown files will be rendered as HTML pages and included in the built site
- `.md` files in the `_episodes` folder will be added to the Episodes dropdown, page navigation, etc
- Markdown files must include _front matter_: metadata specified in a YAML header bounded by `---`
- At minimum, this must include a `title` field
-
-~~~
---
-title: The Title of the Section
---
-~~~
-{: .source}
-
- but really your episodes (lesson sections) should include:
-  - an estimate of time required for teaching & exercises
-  - main questions answered in the section
-  - learning objectives
-  - key points to summarise what's covered in the section (these end are added at the end of the lession section)
- as an example, below is the front matter for this page
-
-~~~
---
-title: Syntax Elements & Powerful Functions
-teaching: 20
-exercises: 10
-questions:
- "What elements of Python syntax might I see in other people's code?"
- "How can I use these additional features of Python to take my code to the next level?"
- "What built-in functions and standard library modules are recommended to improve my code?"
-objectives:
- "write comprehensions to improve code readability and efficiency."
- "call functions designed to make common tasks easier and faster."
- "recognise all elements of modern Python syntax and explain their purpose."
-keypoints:
- "Use comprehensions to efficiently create new iterables with fewer lines of code."
- "Sets can be extremely useful when comparing collections of objects, and create significantly speed up your code."
- "The `itertools` module includes many helpful functions for working with iterables."
- "A decorator is a function that does something to the output of another function."
---
-~~~
-{: .source}
-
-## Code blocks
-
-code snippets written like this
-
-{% raw %}
-    ~~~
-    print(weight_kg)
-    ~~~
-    {: .language-python}
-    ~~~
-    60.0
-    ~~~
-    {: .output}
-{% endraw %}
-
-will produce formatted blocks like this:
-
-~~~
-print(weight_kg)
-~~~
-{: .language-python}
-~~~
-60.0
-~~~
-{: .output}
-
-## Special blockquotes
-
- The lesson template also includes a range of styled boxes
-  - examples for exercises and callouts below
-  - see [this section][lesson-example-blockquotes] of The Carpentries Lesson Example for the full list
-
-A callout block written like this
-
-~~~
-> ## Callout block example
+> ## Yield or return to battle
 >
-> Write callout blocks as blockquotes,
-> with a styling tag (techincal term is a _class identifier_) at the end.
+> Is the following definition valid Python?
+> What kind of function is it?
+> Is the number `2` accessible somehow?
 >
 > ~~~
-> # you can still include code blocks in the callout
-> weight_lb = 2.2 * weight_kg
-> print(weight_kg_text, weight_kg, 'and in pounds:', weight_lb)
-> ~~~
-> {: .language-python}
->
-> Use callouts for asides and comments -
-> anything that provides additional detail to the core of your material
-{: .callout}
-~~~
-{: .source}
-
-will be rendered like this:
-
-> ## Callout block example
->
-> Write callout blocks as blockquotes,
-> with a styling tag (techincal term is a _class identifier_) at the end.
->
-> ~~~
-> # you can still include code blocks in the callout
-> weight_lb = 2.2 * weight_kg
-> print(weight_kg_text, weight_kg, 'and in pounds:', weight_lb)
-> ~~~
-> {: .language-python}
->
-> Use callouts for asides and comments -
-> anything that provides additional detail to the core of your material
-{: .callout}
-
-Similarly, exercises written like this
-
-~~~
-> ## Sorting Out References
->
-> What does the following program print out?
->
-> ~~~
-> first, second = 'Grace', 'Hopper'
-> third, fourth = second, first
-> print(third, fourth)
+> def get_values():
+>     return 2
+>     yield 1
 > ~~~
 > {: .language-python}
 >
 > > ## Solution
 > >
-> > This text will only be visible if the solution is expanded
+> > The code defines a `generator` due to the use of `yield`.
+> > In Python versions prior to 3.3 the above would not be valid syntax.
+> > Using `return` is somewhat equivalent to raising `StopIteration`.
+> > The value `2` is accessible only by inspecting the `StopIteration` exception.
 > > ~~~
-> > Hopper Grace
+> > gen = get_values()
+> > try:
+> >     next(gen)
+> > except StopIteration as e:
+> >     print("StopIteration has value", e.value)
 > > ~~~
 > > {: .output}
 > {: .solution}
 {: .challenge}
-~~~
-{: .source}
-
-will be rendered like this (note the expandable box containing the solution):
-
-> ## Sorting Out References
->
-> What does the following program print out?
->
-> ~~~
-> first, second = 'Grace', 'Hopper'
-> third, fourth = second, first
-> print(third, fourth)
-> ~~~
-> {: .language-python}
->
-> > ## Solution
-> >
-> > This text will only be visible if the solution is expanded
-> > ~~~
-> > Hopper Grace
-> > ~~~
-> > {: .output}
-> {: .solution}
-{: .challenge}
-
-## Shared link references
-
- Lastly, the last line in every `.md` file for each page should be
-
-{% raw %}
-`{% include links.md %}`
-{% endraw %}

- This allows us to share link references across the entire site, which makes the links much more maintainable.
-  - link URLs should be put in the `_includes/links.md` file (ideally, arranged alphabetically by reference)
-  - you can then write Markdown links "reference-style" i.e. `[link text to be displayed][reference-id]`, with `[reference-id]: https://link.to.page` in `_includes/links.md`

 {% include links.md %}
--- a/_episodes/02-data.md
+++ b/_episodes/02-data.md
@@ -35,13 +35,169 @@ keypoints:
    - include an aside about I/O - reading/writing files (pandas (the `.to_*()` methods and highlight some: `csv`, `json`, `feather`, `hdf`), numpy, `open()`, (?) bytes vs strings, (?) encoding)
  - Finish with example of `df.plot()` to set the scene for plotting section

+## Numpy
+
+## Reading data to a numpy array
+
+We'll use the popular image analysis package scikit-image,
+to read two example images into numpy arrays.
+
+~~~
+from skimage.io import imread
+raw = imread('cilliated_cell.png')
+nuclei = imread('cilliated_cell_nuclei.png')
+# if you want to see what these images look like - we can use matplotlib (more to come later!)
+import matplotlib.pyplot as plt
+plt.imshow(raw, cmap='gray')
+plt.imshow(nuclei)
+~~~
+{: .language-python }
+
+> ## Exploring Image Arrays
+>
+> * What are the dimensions of these arrays?
+> * What data type are these arrays?
+> * What is the minimum and maximum value of these arrays?
+>
+> > ## Solution
+> > ~~~
+> > print(raw.shape)
+> > print(raw.dtype)
+> > print(np.max(raw))
+> > print(np.min(raw))
+> > ~~~
+> > {: .language-python }
+> {: .solution }
+{: .challenge }
+
+> ## Masking arrays
+>
+> The nuclei image contains a binary segmentation i.e.:
+>
+> * 1 = nuclei
+> * 0 = not nuclei
+>
+> 1. Find the median value of the raw image within the nuclei
+> 2. Create a new version of raw where all values outside the nuclei are 0
+>
+> > ## Solution
+> > ~~~
+> > # 1
+> > pixels_in_nuclei = raw[nuclei == 1]
+> > print(np.median(pixels_in_nuclei))
+> >
+> > # 2
+> > new_image = raw.copy()
+> > new_image[nuclei == 0] = 0
+> > plt.imshow(new_image, cmap='gray')
+> > ~~~
+> > {: .language-python }
+> {: .solution }
+{: .challenge }
+
+
+## Pandas
+
+- load data
+
+~~~
+import pandas as pd
+covid_cases = pd.read_csv("data/CovidCaseData_20200624.csv")
+~~~
+{: .language-python }
+
+- display dataframe
+    - `head` & `tail`
+
+    ~~~
+    covid_cases.head()
+    covid_cases.tail()
+    ~~~
+    {: .language-python }
+
+    - `shape`
+
+    ~~~
+    print(f'covid_cases is a {type(covid_cases)} object with {covid_cases.shape[0]} rows and {covid_cases.shape[1]} columns')
+    print('covid_cases has the following columns:\n' + '\n'.join(covid_cases.columns))
+    ~~~
+    {: .language-python }
+
+- much easier to use `covid_cases.info()`
+- get statistics on quantitative columns with `.describe()`
+  - sometimes this information isn't helpful (e.g. mean of year column)
+  - but good to see we don't have any missing data (count is identical for all columns)
+
+### Selecting Data in a Dataframe
+
+- use `iloc` & `loc` to select specific cell(s) from a dataframe
+
+~~~
+print(covid_cases.iloc[24242,4])
+print(covid_cases.iloc[24242,])
+print(covid_cases.iloc[24242,:])
+~~~
+{: .language-python }
+
+- you can use slices with `iloc` & `loc`
+
+~~~
+print(covid_cases.iloc[100:120,4:6])
+print(covid_cases.iloc[100:120,:])
+
+# select a whole row
+print(covid_cases.loc[0,:])
+# or
+print(covid_cases.loc[0,])
+
+# select a whole column
+print(covid_cases.loc[:,'continentExp'])
+# or
+print(covid_cases['continentExp'])
+# or(!)
+print(covid_cases.continentExp)
+~~~
+{: .language-python }
+
+- columns are returned as a pandas Series object
+
+~~~
+type(covid_cases['continentExp'])
+~~~
+{: .language-python }
+
+- get the unique values of a series (as a numpy array):
+
+~~~
+pd.unique(covid_cases['continentExp'])
+~~~
+{: .language-python }
+
+### Filtering Data in a Dataframe
+
+- you've seen indexing, but with large datasets that you can't easily explore by eye, it's much more useful to select/filter data based on value
+
+~~~
+covid_cases['continentExp'] == 'Europe'
+~~~
+{: .language-python }
+
+- INSERT EXERCISE from !11 here
+
+- results of filtering can be used in further operations
+
+~~~
+covid_cases[covid_cases['continentExp'] == 'Europe'].max()
+~~~
+{: .language-python }
+
 > ## Working with Filtered Data
 >
 > 1. On what date were the most cases reported in Germany so far?
 > 2. What was the mean number of cases reported per day in Germany in April 2020?
 > 3. Is this higher or lower than the mean for March 2020?
 > 4. On how many days in March was the number of cases in Germany higher than the mean for April?
-> 
+>
 > > ## Solution
 > > ~~~
 > > # 1
@@ -71,4 +227,166 @@ keypoints:
 > {: .solution }
 {: .challenge }

+### Combining Dataframes
+
+- where you have unique values in a column, you can set this as the 'index'
+
+~~~
+asia_lockdowns = pd.read_csv('data/AsiaLockdowns.csv', index_col=0)
+africa_lockdowns = pd.read_csv('data/AfricaLockdowns.csv', index_col=0)
+~~~
+{: .language-python }
+
+- rows from two dataframes with the same columns can be easily stacked into a single df with `pd.concat`
+
+- INSERT EXERCISE from !21 here
+
+- INSERT EXERCISE from !17 here
+
+- there are blanks in the lockdown end date column - probably because these lockdowns haven't ended yet(?).
+- let's try to set the end date to the latest date included in the case data
+
+~~~
+latest_date = covid_cases['dateRep'].max()
+print(latest_date)
+~~~
+{: .language-python }
+
+- whoa! something is wrong here!
+- check the type of the data in the `"dateRep"` column
+
+~~~
+print(covid_cases['dateRep'].dtype)
+~~~
+{: .language-python }
+
+- these values are being treated (and therefore sorted) as strings!
+
+#### Working with Datetime Columns
+
+- luckily, pandas provides an easy way to convert these to a datetime type, which can be properly handled
+
+~~~
+pd.to_datetime(covid_cases['dateRep'], dayfirst=True)
+# dayfirst=True is necessary because by default pandas reads mm/dd/yyyy dates :(
+covid_cases['dateRep'] = pd.to_datetime(covid_cases['dateRep'], dayfirst=True)
+print(covid_cases['dateRep'].max())
+~~~
+{: .language-python }
+
+- much better!
+- now we can fill in those blank lockdown end dates
+
+~~~
+covid_lockdowns['End date'] = (covid_lockdowns['End date']
+                                .fillna(covid_cases['dateRep'].max()))
+~~~
+{: .language-python }
+
+- INSERT EXERCISE from !22 here
+- we saw earlier how to add rows to a df with `concat`
+- you can also add additional columns to a dataframe
+  - treat it like a dictionary
+
+~~~
+covid_cases['casesPerMillion'] = covid_cases['cases'] / (covid_cases['popData2019']/1e6)
+covid_cases.head()
+~~~
+{: .language-python }
+
+- and you can add data from another dataframe
+  - but need to make the key column names match up first!
+
+~~~
+covid_lockdowns.index.name='countriesAndTerritories'
+covid_cases.merge(covid_lockdowns, on="countriesAndTerritories")
+~~~
+{: .language-python }
+
+- For more on combining series and dataframes, we recommend these two chapters of Jake Vanderplas' _Python Data Science Handbook_:
+  - [Combining Datasets: Concat and Append](https://jakevdp.github.io/PythonDataScienceHandbook/03.06-concat-and-append.html)
+  - [Combining Datasets: Merge and Join](https://jakevdp.github.io/PythonDataScienceHandbook/03.07-merge-and-join.html)
+
+~~~
+combined = covid_cases.merge(covid_lockdowns, on="countriesAndTerritories")
+~~~
+{: .language-python }
+
+- INSERT EXERCISE from !24 here
+
+### Groupby & Split-Apply-Combine
+
+- we'll keep working with the inner-joined `combined` df
+- as a demonstration of the power of pandas, let's calculate the average time between the beginning of a country's lockdown, and the peak in new cases in that country
+  - do this with a rolling average of cases, to make the method less sensitive to noise in data
+- the dataframe is currently sorted in reverse date order
+- before we can calculate a rolling mean, we'll need to re-sort the data from longest-ago to most-recent:
+
+~~~
+combined = combined.sort_values(by=['countriesAndTerritories','dateRep'])
+~~~
+{: .language-python }
+
+- now we can calculate our rolling mean of cases for each country
+- this uses the split-apply-combine paradigm:
+  - first grouping by country,
+  - then calculating the rolling mean
+  - we get back a single Series, indexed by country and by the original row index
+
+~~~
+rolling_mean_cases = combined.groupby('countriesAndTerritories')['cases'].rolling(7).mean()
+~~~
+{: .language-python }
+
+- unfortunately, the index of our rolling mean series is incompatible with our main df, so we can't add it as a new column
+
+~~~
+combined['rolling mean'] = rolling_mean_cases
+~~~
+{: .language-python }
+
+- we can use the `reset_index` method to remove the `"countriesAndTerritories"` index
+
+~~~
+rolling_mean_cases = rolling_mean_cases.reset_index(0, drop=True)
+~~~
+{: .language-python }
+
+- and now we can add the rolling means as a new column
+
+~~~
+combined['rolling mean'] = rolling_mean_cases
+
+# plot rolling average for Germany
+combined[combined['countriesAndTerritories']=='Germany'].set_index('dateRep')['rolling mean'].plot(kind='line')
+# plot cumulative sum of cases for Germany
+combined[combined['countriesAndTerritories']=='Germany'].set_index('dateRep')['cases'].cumsum().plot(kind='line')
+~~~
+{: .language-python }
+
+- now we want to calculate the time difference between the start of lockdown and the peak of rolling mean cases in each country
+- INSERT EXERCISE from !23 here
+- set the index so we can compare the dates:
+
+~~~
+peak_dates.index = combined.loc[peak_rows]['countriesAndTerritories']
+start_dates.index = combined.loc[peak_rows]['countriesAndTerritories']
+print((peak_dates - start_dates).median())
+~~~
+{: .language-python }
+
+- as a final flourish, let's create a boxplot showing the distribution of these time differences:
+- because `plot(kind="box")` doesn't know how to work with time difference (timedelta) values, we need to extract the number of days as an integer.
+- can do this with the `days` attribute of each timedelta value, but have to write a function to get it so we can use `.apply`...
+
+~~~
+def get_days(t):
+    return t.days
+
+(peak_dates - start_dates).apply(get_days).plot(kind='box')
+~~~
+{: .language-python }
+
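+![Boxplot showing the distribution of the number of days between the start of lockdown and the peak of rolling mean cases](/fig/boxplot.png)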
+
 {% include links.md %}
--- a/_episodes/04-argparse.md
+++ b/_episodes/04-argparse.md
@@ -73,6 +73,33 @@ keypoints:
 > Pair up and discuss with a partner.
 {: .challenge}

+> ## Options & Arguments Revisited
+> In an earlier exercise,
+> you filled in the blanks in code designed to
+> parse options and arguments from the command line using `sys.argv`.
+> Use what you've learned about `argparse` to write a parser to replace that
+> code.
+>
+> As a reminder, after the changes you made earlier,
+> the program should take
+> **one (and _only_ one) of three modes** (`--sum`, `--product`, `--mean`)
+> to determine which function will be used in the body of the script,
+> **an arbitrary number of input files**,
+> and, **optionally, an output file name**.
+> Also include the possibility to provide shorthand, single-letter flags
+> (`-s`, `-p`, and `-m`)
+> for each of the modes.
+>
+> Once you've got this parser up and running, try it out.
+> Start by running the script with only the `-h` flag,
+> then try providing the expected user input,
+> before finally providing some incorrect/incompatible options and/or arguments
+> and observe the result in each case.
+> Do you think this is better than the approach we used before?
+> If so, do you consider the improvement sufficient to be worth the effort?
+>
+{: .challenge }
+
 > ## Carry the Zero
 >
 > What does `sys.argv[0]` return when you run it in a script?

--- a/_episodes/05-style.md
+++ b/_episodes/05-style.md
@@ -36,6 +36,62 @@ keypoints:
  - be careful with cell order
  - clear output before saving

+- In _Comments_ section: address what a good comment looks like
+  - from Renato: "useful comments should answer the _why_ question. Sometimes answering the _what_ is also useful. e.g. explaining what that number 10 that dropped out of nowhere is. The _how_ is usually something you get by reading the code."
+
+> ## Words Between the Lines of Age
+>
+> Given an input string and a substring length,
+> the function `count_frequencies` returns counts of substrings
+> found in the input string.
+> An additional parameter is used to determine whether the counted
+> substrings should overlap or not.
+>
+> ~~~
+> from collections import defaultdict
+>
+> def count_frequencies(s, w, o):
+>     counts = defaultdict(int)
+>     if o:
+>         step = 1
+>     else:
+>         step = w
+>     for i in range(0, len(s), step):
+>         word = s[i:i+w]
+>         if len(word) < w:
+>             return counts
+>         counts[word] += 1
+>     return counts
+> ~~~
+> {: .language-python }
+> You are allowed to make only one of the following changes to the function:
+>
+> 1. Rename the input variables `s`, `w`, and `o`
+> 2. Add a docstring to the function
+> 3. Set default values for one or more of the input parameters
+> 4. Insert comments to annotate the function definition
+>
+> Which do you think is the most important change to make?
+> Pair up and explain your choice to your partner.
+> Did you both make the same choice?
+> If not, did you find your partner's justification convincing?
+>
+> > ## Solution
+> > Here are some suggested benefits for each choice:
+> >
+> > 1. Renaming the variables, perhaps to something like `input_string`, `word_length`, and `overlapping`, would make the purpose (and, in two cases at least, the expected type) of each of the function's arguments much clearer. It is much more likely that someone seeing this function for the first time would understand what was going on with variable names like these.
+> > 2. Adding a comprehensive docstring for the function would allow users to check how to use it via Python's built-in `help`, or via various features of their chosen IDE (e.g. Jupyter users could type `count_frequencies?` and get a pop-up usage message). You can write as much detail as you like into the docstring, so this is probably the most comprehensive option!
+> > 3. Default values can help make the purpose of the function's arguments clearer, e.g. `o=True` tells the user that this option is a logical switch. Furthermore, if all but one of a function's arguments have a default value, and the user knows what kind of object the function expects to operate on, it's less likely they will need to check the documentation/signature of the function. (Which, in this case, would be A Good Thing because it wouldn't be much help!)
+> > 4. Comments can be really helpful for someone reading the source code of the function, to help them understand what each line is doing. You're also not limited in how many comments you can insert, so this option would allow you to be very thorough! However, these comments aren't visible to anyone trying to work with your function e.g. calling `help(count_frequencies)` after loading it from a module, so it's not the most user-friendly option.
+> >
+> > All the options given are valid choices and, thankfully, in reality
+> > you're unlikely to ever need to choose between them!
+> > The authors would choose option 1
+> > because good, self-explanatory variable names go a long way to
+> > making your code _self-documenting_.
+> >
+> {: .solution }
+{: .challenge}
+
 > ## Fashions change. Style is forever.
 > Look at the following three code blocks. (Based on [this script][matplotlib-hinton] from the Matplotlib Example Gallery).
 >
@@ -113,4 +169,162 @@ keypoints:
 > - Pair up and compare your notes with a partner's. Did you both identify the same problems?
 {: .challenge }

+> ## Comparing Different Code Checkers
+> Analyse the code below with `pycodestyle`, `pylint`, and `pyflakes`,
+> using default settings in each case.
+> (You can download the script [here](code/anagrams.py).)
+>
+> ~~~
+> def find_anagrams(words,ignore_case = True) :
+>     anagrams = []
+>     if ignore_case == True:
+>         charsets = [ set(w.lower()) for w in words ]
+>         return [w for w in words if charsets.count(set(w.lower())) > 1]
+>     else:
+>         charsets = [ set(w) for w in words ]
+>         return [w for w in words if charsets.count(set(w)) > 1]
+>
+> test_words = "back to the time when trouble was not always on our minds No mite item".split()
+>
+> print(find_anagrams(test_words, ignore_case=False))
+> ~~~
+> {: .language-python }
+>
+> What differences do you notice in the output?
+> How many different types of problem do these tools find in the code?
+> Use the information provided by these tools to guide you while you fix our code.
+> Is there anything that you think could be improved in the code,
+> which wasn't picked up by any of the code-checking tools used above?
+>
+> > ## Solution
+> >
+> > The expected output of each tool is included below:
+> >
+> > ~~~
+> > $ pycodestyle anagrams.py
+> > ~~~
+> > {: .language-bash }
+> > ~~~
+> > anagrams.py:1:24: E231 missing whitespace after ','
+> > anagrams.py:1:36: E251 unexpected spaces around keyword / parameter equals
+> > anagrams.py:1:38: E251 unexpected spaces around keyword / parameter equals
+> > anagrams.py:1:44: E203 whitespace before ':'
+> > anagrams.py:3:20: E712 comparison to True should be 'if cond is True:' or 'if cond:'
+> > anagrams.py:4:21: E201 whitespace after '['
+> > anagrams.py:4:51: E202 whitespace before ']'
+> > anagrams.py:7:21: E201 whitespace after '['
+> > anagrams.py:7:43: E202 whitespace before ']'
+> > anagrams.py:10:1: E305 expected 2 blank lines after class or function definition, found 1
+> > anagrams.py:10:80: E501 line too long (93 > 79 characters)
+> > ~~~
+> > {: .output }
+> >
+> > ~~~
+> > $ pylint anagrams.py
+> > ~~~
+> > {: .language-bash }
+> > ~~~
+> > ************* Module anagrams
+> > anagrams.py:1:23: C0326: Exactly one space required after comma
+> > def find_anagrams(words,ignore_case = True) :
+> >                        ^ (bad-whitespace)
+> > anagrams.py:1:36: C0326: No space allowed around keyword argument assignment
+> > def find_anagrams(words,ignore_case = True) :
+> >                                     ^ (bad-whitespace)
+> > anagrams.py:1:44: C0326: No space allowed before :
+> > def find_anagrams(words,ignore_case = True) :
+> >                                             ^ (bad-whitespace)
+> > anagrams.py:4:19: C0326: No space allowed after bracket
+> >         charsets = [ set(w.lower()) for w in words ]
+> >                    ^ (bad-whitespace)
+> > anagrams.py:4:51: C0326: No space allowed before bracket
+> >         charsets = [ set(w.lower()) for w in words ]
+> >                                                    ^ (bad-whitespace)
+> > anagrams.py:7:19: C0326: No space allowed after bracket
+> >         charsets = [ set(w) for w in words ]
+> >                    ^ (bad-whitespace)
+> > anagrams.py:7:43: C0326: No space allowed before bracket
+> >         charsets = [ set(w) for w in words ]
+> >                                            ^ (bad-whitespace)
+> > anagrams.py:1:0: C0114: Missing module docstring (missing-module-docstring)
+> > anagrams.py:1:0: C0116: Missing function or method docstring (missing-function-docstring)
+> > anagrams.py:3:4: R1705: Unnecessary "else" after "return" (no-else-return)
+> > anagrams.py:3:7: C0121: Comparison to True should be just 'expr' (singleton-comparison)
+> > anagrams.py:2:4: W0612: Unused variable 'anagrams' (unused-variable)
+> >
+> > --------------------------------------------------------------------
+> > Your code has been rated at -3.33/10
+> > ~~~
+> > {: .output }
+> >
+> > ~~~
+> > $ pyflakes anagrams.py
+> > ~~~
+> > {: .language-bash }
+> > ~~~
+> > anagrams.py:2: local variable 'anagrams' is assigned to but never used
+> > ~~~
+> > {: .output }
+> >
+> > `pylint` provides the most comprehensive list of issues with the example code, including a warning about the unused variable, as well as flagging up the issues involving whitespace and redundancy in the comparison with `True` that `pycodestyle` identified. Unlike either of the other two tools, `pylint` also noticed that the function didn't include a docstring.
+> >
+> > An issue that wasn't identified by any of the three code checkers is
+> > the repetition in the `charsets = [...]` lines.
+> >
+> > ~~~
+> >         charsets = [ set(w.lower()) for w in words ] # <---
+> >         return [w for w in words if charsets.count(set(w.lower())) > 1]
+> >     else:
+> >         charsets = [ set(w) for w in words ]         # <---
+> > ~~~
+> > {: .language-python }
+> >
+> > One rule of good coding is _Don't Repeat Yourself (DRY)_:
+> > multiple lines containing (almost) identical code are
+> > usually a sign of an inefficient program.
+> >
+> > A cleaned-up version of our program,
+> > addressing all of the problems identified above,
+> > might look like this:
+> >
+> > ~~~
+> > '''
+> > A module of tools for finding anagrams in lists of strings.
+> > '''
+> >
+> > def find_anagrams(words, ignore_case=True):
+> >     '''
+> >     Find all anagrams within a list of strings.
+> >
+> >     Parameters:
+> >         words:          a list of strings to be filtered
+> >         ignore_case:    if True, treat equivalent characters
+> >                         in upper and lowercase (e.g. 'A' and 'a')
+> >                         as matching. (default: True)
+> >
+> >     Returns:
+> >         If any anagrams were found, a list containing those strings.
+> >         Otherwise, an empty list.
+> >     '''
+> >     if ignore_case:
+> >         text_transform = str.lower
+> >     else:
+> >         text_transform = str
+> >
+> >     charsets = [set(text_transform(w)) for w in words]
+> >     return [w for w in words if charsets.count(set(text_transform(w))) > 1]
+> >
+> >
+> > test_words = """
+> > back to the time
+> > when trouble was
+> > not always on our
+> > minds No mite item""".split()
+> >
+> > print(find_anagrams(test_words, ignore_case=False))
+> > ~~~
+> > {: .language-python }
+> {: .solution }
+{: .challenge }
+
 {% include links.md %}
--- a/_includes/favicons.html
+++ b/_includes/favicons.html
@@ -8,6 +8,8 @@
 {% assign carpentry = 'Library Carpentry' %}
 {% elsif site.carpentry == 'cp' %}
 {% assign carpentry = 'The Carpentries' %}
+{% elsif site.carpentry == "bio-it" %}
+{% assign carpentry = 'EMBL Bio-IT' %}
 {% endif %}

    <!-- Favicons for everyone -->

--- a/_includes/lesson_footer.html
+++ b/_includes/lesson_footer.html
@@ -25,6 +25,9 @@
 	{% elsif site.carpentry == "cp" %}
 	Licensed under <a href="{{ site.cc_by_human }}">CC-BY 4.0</a> 2018–{{ 'now' | date: "%Y" }}
 	by <a href="{{ site.carpentries_site }}">The Carpentries</a>
+  {% elsif site.carpentry == "bio-it" %}
+  Licensed under <a href="{{ site.cc_by_human }}">CC-BY 4.0</a> 2020–{{ 'now' | date: "%Y" }}
+	by <a href="{{ site.bio-it-home }}">EMBL Bio-IT</a>
 	{% endif %}
    </div>
    <div class="col-md-6 help-links" align="right">
@@ -47,7 +50,7 @@
  </div>
  <div class="row">
    <div class="col-md-12" align="center">
-      Using <a href="https://github.com/carpentries/styles/">The Carpentries style</a>
+      Template adapted from <a href="https://github.com/carpentries/styles/">The Carpentries style</a>
      version <a href="https://github.com/carpentries/styles/releases/tag/v9.5.3">9.5.3</a>.
    </div>
  </div>

--- a/_includes/navbar.html
+++ b/_includes/navbar.html
@@ -32,6 +32,10 @@
      <a href="{{ site.carpentries_site }}" class="pull-left">
        <img class="navbar-logo" src="{{ relative_root_path }}{% link /assets/img/cp-logo-blue.svg %}" alt="The Carpentries logo" />
      </a>
+      {% elsif site.carpentry == "bio-it" %}
+      <a href="{{ site.bio-it-home }}" class="pull-left">
+        <img class="navbar-logo" src="{{ relative_root_path }}{% link /assets/img/Bio-IT-screen.svg %}" alt="Bio-IT logo" />
+      </a>
      {% endif %}

      {% comment %} Always show link to home page. {% endcomment %}

--- a/_includes/workshop_footer.html
+++ b/_includes/workshop_footer.html
@@ -14,12 +14,18 @@
 	<a href="{{ site.lc_site }}">Library Carpentry</a>
 	{% elsif site.carpentry == "cp" %}
 	<a href="{{ site.carpentries_site }}">The Carpentries</a>
+  {% elsif site.carpentry == "bio-it" %}
+  <a href="{{ site.bio-it-home }}">EMBL Bio-IT</a>
 	{% endif %}
      </h4>
    </div>
    <div class="col-md-6" align="right">
      <h4>
+  {% if site.carpentry == "bio-it" %}
+	<a href="mailto:{{ site.email }}">Contact EMBL Bio-IT</a>
+  {% else %}
 	<a href="mailto:{{ site.email }}">Contact The Carpentries</a>
+  {% endif %}
      </h4>
    </div>
  </div>

--- a/assets/img/Bio-IT-screen.svg
+++ b/assets/img/Bio-IT-screen.svg
--- a/assets/img/Bio-IT-screen_tower.svg
+++ b/assets/img/Bio-IT-screen_tower.svg
--- a/bin/lesson_check.py
+++ b/bin/lesson_check.py
@@ -179,7 +179,7 @@ def check_config(reporter, source_dir):
    reporter.check_field(config_file, 'configuration',
                         config, 'kind', 'lesson')
    reporter.check_field(config_file, 'configuration',
-                         config, 'carpentry', ('swc', 'dc', 'lc', 'cp'))
+                         config, 'carpentry', ('swc', 'dc', 'lc', 'cp', 'bio-it'))
    reporter.check_field(config_file, 'configuration', config, 'title')
    reporter.check_field(config_file, 'configuration', config, 'email')


--- a/code/anagrams.py
+++ b/code/anagrams.py
+def find_anagrams(words,ignore_case = True) :
+    anagrams = []
+    if ignore_case == True:
+        charsets = [ set(w.lower()) for w in words ]
+        return [w for w in words if charsets.count(set(w.lower())) > 1]
+    else:
+        charsets = [ set(w) for w in words ]
+        return [w for w in words if charsets.count(set(w)) > 1]
+
+test_words = "back to the time when trouble was not always on our minds No mite item".split()
+
+print(find_anagrams(test_words, ignore_case=False))
--- a/data/CovidLockdowns.csv
+++ b/data/CovidLockdowns.csv
@@ -8,7 +8,7 @@ Austria,2020-03-16,2020-04-13
 Azerbaijan,2020-03-31,2020-04-20
 Bangladesh,2020-03-26,2020-05-16
 Barbados,2020-03-28,2020-05-03
-Belgium,2020-03-18,2020-05-04 
+Belgium,2020-03-18,2020-05-04
 Bermuda,2020-04-04,2020-05-02
 Bolivia,2020-03-22,2020-04-15
 Botswana,2020-04-02,2020-04-30
@@ -25,7 +25,7 @@ Fiji,2020-03-20,2020-04-17
 Finland,2020-03-27,2020-04-16
 France,2020-03-17,2020-05-11
 Georgia,2020-03-31,2020-04-21
-Germany,2020-03-23,2020-04-20 to 2020-05-10
+Germany,2020-03-23,2020-05-10
 Ghana,2020-03-30,2020-04-12
 Greece,2020-03-23,2020-05-04
 Guernsey,2020-03-25,
@@ -39,7 +39,7 @@ Israel,2020-04-02,
 Italy,2020-03-09,2020-05-18
 Jamaica,2020-04-15,2020-04-22
 Jordan,2020-03-18,2020-04-30
-Kosovo,2020-03-14,2020-05-04 
+Kosovo,2020-03-14,2020-05-04
 Kuwait,2020-03-14,2020-03-29
 Lebanon,2020-03-15,2020-03-28
 Liberia,2020-03-23,2020-04-11
@@ -72,7 +72,7 @@ Rwanda,2020-03-21,2020-04-19
 Samoa,2020-03-26,2020-04-08
 San Marino,2020-03-14,2020-05-05
 Saudi Arabia,2020-03-09,
-Serbia,2020-03-15,2020-04-21 to 2020-05-04
+Serbia,2020-03-15,2020-05-04
 Singapore,2020-04-07,2020-06-01
 South Africa,2020-03-26,2020-04-30
 Spain,2020-03-14,2020-05-09

--- a/fig/boxplot.png
+++ b/fig/boxplot.png
No results found